From 0262f7f42e30e4230b432a209f398f1867f950d9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 001/244] Revert "[tune] Avoid crash in client mode when return results creating logdir (#14115)" This reverts commit 657683ed5ac56d585a99c52e47b0da8694dbccce. --- python/ray/tune/trial.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 7507ab50dfb0..0070177803df 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -1,7 +1,6 @@ from typing import Callable, Dict, Sequence, Union import json -import ray import ray.cloudpickle as cloudpickle from collections import deque import copy @@ -641,9 +640,4 @@ def __setstate__(self, state): self.__dict__.update(state) validate_trainable(self.trainable_name) - - # Avoid creating logdir in client mode for returned trial results, - # since the dir might not be creatable locally. TODO(ekl) thsi is kind - # of a hack. - if not ray.util.client.ray.is_connected(): - self.init_logdir() # Create logdir if it does not exist + self.init_logdir() # Create logdir if it does not exist From e8457f00620bccf5bd564c07294b8e738c3d53ed Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 002/244] Revert "[autoscaler] Remove Hardcoded 8265 (#14112)" This reverts commit c9131fd19cae352514144a1434049e18558cebaf. 
--- python/ray/scripts/scripts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 50ac89f03bf7..8deaa6f4a2f0 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -117,13 +117,13 @@ def cli(logging_level, logging_format): "-p", required=False, type=int, - default=ray_constants.DEFAULT_DASHBOARD_PORT, + default=8265, help="The local port to forward to the dashboard") @click.option( "--remote-port", required=False, type=int, - default=ray_constants.DEFAULT_DASHBOARD_PORT, + default=8265, help="The remote port your dashboard runs on") def dashboard(cluster_config_file, cluster_name, port, remote_port): """Port-forward a Ray cluster's dashboard to the local machine.""" From 08997cf96dea4b6cf0ec6cd6794f21df7b9cd347 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 003/244] Revert "[autoscaler] Fix bad reference error when specifying IamInstanceProfile by name in config. (#14083)" This reverts commit 6e5625308f4441725003f1fdaa5f55d718577abd. 
--- python/ray/autoscaler/_private/aws/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ray/autoscaler/_private/aws/config.py b/python/ray/autoscaler/_private/aws/config.py index 9aa3e6d85778..2fb90787b5eb 100644 --- a/python/ray/autoscaler/_private/aws/config.py +++ b/python/ray/autoscaler/_private/aws/config.py @@ -155,11 +155,11 @@ def print_info(resource_string, _tags=workers_tags) tags = {"default": _log_info["head_instance_profile_src"] == "default"} - profile_arn = config["head_node"]["IamInstanceProfile"].get("Arn") - profile_name = _arn_to_name(profile_arn) \ - if profile_arn \ - else config["head_node"]["IamInstanceProfile"]["Name"] - cli_logger.labeled_value("IAM Profile", "{}", profile_name, _tags=tags) + cli_logger.labeled_value( + "IAM Profile", + "{}", + _arn_to_name(config["head_node"]["IamInstanceProfile"]["Arn"]), + _tags=tags) if ("KeyName" in config["head_node"] and "KeyName" in config["worker_nodes"]): From 339c94a40b1c07b7ec9a10403f9e28f8fa94cac4 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 004/244] Revert "[tune] PB2 - add small constant (#14118)" This reverts commit 3225e3c2e728b2ebc9801b281878d8d340f9e4e0. --- python/ray/tune/schedulers/pb2_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/schedulers/pb2_utils.py b/python/ray/tune/schedulers/pb2_utils.py index 37dc422e0337..881d5345f04d 100644 --- a/python/ray/tune/schedulers/pb2_utils.py +++ b/python/ray/tune/schedulers/pb2_utils.py @@ -75,7 +75,7 @@ def normalize(data, wrt): which can be specified. 
""" return (data - np.min(wrt, axis=0)) / ( - np.max(wrt, axis=0) - np.min(wrt, axis=0) + 1e-8) + np.max(wrt, axis=0) - np.min(wrt, axis=0)) def standardize(data): From eb4da50a5e497aca594131dcf47dfe0e8a552cbd Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 005/244] Revert "[serve] Don't overwrite self.handle in StarletteEndpoint (#14111)" This reverts commit e3e42fe063ae0c88e2ceac35951ef75b07ad51bb. --- python/ray/serve/http_proxy.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/python/ray/serve/http_proxy.py b/python/ray/serve/http_proxy.py index 1aad3e9f4a27..f6fa25bb3df6 100644 --- a/python/ray/serve/http_proxy.py +++ b/python/ray/serve/http_proxy.py @@ -37,9 +37,6 @@ class ServeStarletteEndpoint: def __init__(self, client, endpoint_tag: EndpointTag): self.client = client self.endpoint_tag = endpoint_tag - # This will be lazily populated when the first request comes in. - # TODO(edoakes): we should be able to construct the handle here, but - # that currently breaks pytest. This seems like a bug. 
self.handle = None async def __call__(self, scope, receive, send): @@ -48,15 +45,14 @@ async def __call__(self, scope, receive, send): headers = {k.decode(): v.decode() for k, v in scope["headers"]} if self.handle is None: self.handle = self.client.get_handle(self.endpoint_tag, sync=False) - - object_ref = await self.handle.options( + self.handle = self.handle.options( method_name=headers.get("X-SERVE-CALL-METHOD".lower(), DEFAULT.VALUE), shard_key=headers.get("X-SERVE-SHARD-KEY".lower(), DEFAULT.VALUE), http_method=scope["method"].upper(), - http_headers=headers).remote( - build_starlette_request(scope, http_body_bytes)) - + http_headers=headers) + request = build_starlette_request(scope, http_body_bytes) + object_ref = await self.handle.remote(request) result = await object_ref if isinstance(result, RayTaskError): From 6444e3054eb9064c0d5bf4714d3f70c4837e9a6e Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 006/244] Revert "[Object Spilling] Remove LRU eviction (#13977)" This reverts commit 0099343923c56c9d86b4b9d36e97a51f22aaf742. 
--- python/ray/node.py | 4 + python/ray/parameter.py | 19 +++- python/ray/tests/test_actor_failures.py | 22 ++-- python/ray/tests/test_advanced_3.py | 5 +- python/ray/tests/test_basic_2.py | 6 +- python/ray/tests/test_failure.py | 50 +++++++++ python/ray/tests/test_reference_counting.py | 4 +- python/ray/worker.py | 9 ++ src/ray/common/ray_config_def.h | 14 ++- .../plasma/create_request_queue.cc | 11 +- .../plasma/create_request_queue.h | 19 +++- src/ray/object_manager/plasma/store.cc | 56 +++++----- src/ray/object_manager/plasma/store.h | 17 ++- .../test/create_request_queue_test.cc | 59 +++++++--- src/ray/raylet/local_object_manager.cc | 28 +++-- src/ray/raylet/local_object_manager.h | 6 +- src/ray/raylet/main.cc | 2 + src/ray/raylet/node_manager.cc | 104 ++++++++++-------- src/ray/raylet/node_manager.h | 4 + .../raylet/test/local_object_manager_test.cc | 1 + 20 files changed, 311 insertions(+), 129 deletions(-) diff --git a/python/ray/node.py b/python/ray/node.py index 05f3383a552f..cd2dc2250677 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -120,6 +120,10 @@ def __init__(self, raise ValueError( "Internal config parameters can only be set on the head node.") + if ray_params._lru_evict: + assert (connect_only or + head), "LRU Evict can only be passed into the head node." + self._raylet_ip_address = raylet_ip_address ray_params.update_if_absent( diff --git a/python/ray/parameter.py b/python/ray/parameter.py index bdeec7627e58..043cc258c0d9 100644 --- a/python/ray/parameter.py +++ b/python/ray/parameter.py @@ -102,6 +102,7 @@ class RayParams: _system_config (dict): Configuration for overriding RayConfig defaults. Used to set system configuration and for experimental Ray core feature flags. + lru_evict (bool): Enable LRU eviction if space is needed. enable_object_reconstruction (bool): Enable plasma reconstruction on failure. 
start_initial_python_workers_for_first_job (bool): If true, start @@ -198,22 +199,30 @@ def __init__(self, self.start_initial_python_workers_for_first_job = ( start_initial_python_workers_for_first_job) self._system_config = _system_config or {} + self._lru_evict = lru_evict self._enable_object_reconstruction = enable_object_reconstruction self._check_usage() # Set the internal config options for LRU eviction. if lru_evict: - raise DeprecationWarning( - "The lru_evict flag is deprecated as Ray natively " - "supports object spilling. Please read " - "https://docs.ray.io/en/master/memory-management.html#object-spilling " # noqa - "for more details.") + # Turn off object pinning. + if self._system_config is None: + self._system_config = dict() + if self._system_config.get("object_pinning_enabled", False): + raise Exception( + "Object pinning cannot be enabled if using LRU eviction.") + self._system_config["object_pinning_enabled"] = False + self._system_config["free_objects_period_milliseconds"] = 1000 # Set the internal config options for object reconstruction. if enable_object_reconstruction: # Turn off object pinning. 
if self._system_config is None: self._system_config = dict() + if lru_evict: + raise Exception( + "Object reconstruction cannot be enabled if using LRU " + "eviction.") print(self._system_config) self._system_config["lineage_pinning_enabled"] = True self._system_config["free_objects_period_milliseconds"] = -1 diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 677b0e0fc940..ff9c9fd45a0e 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -32,9 +32,10 @@ def ray_init_with_task_retry_delay(): @pytest.mark.parametrize( "ray_start_regular", [{ "object_store_memory": 150 * 1024 * 1024, + "_lru_evict": True, }], indirect=True) -def test_actor_spilled(ray_start_regular): +def test_actor_eviction(ray_start_regular): object_store_memory = 150 * 1024 * 1024 @ray.remote @@ -57,14 +58,19 @@ def create_object(self, size): ray.get(obj) # Get each object again. At this point, the earlier objects should have - # been spilled. - num_success = 0 + # been evicted. + num_evicted, num_success = 0, 0 for obj in objects: - val = ray.get(obj) - assert isinstance(val, np.ndarray), val - num_success += 1 - # All of objects should've been spilled, so all of them should succeed. - assert num_success == len(objects) + try: + val = ray.get(obj) + assert isinstance(val, np.ndarray), val + num_success += 1 + except ray.exceptions.ObjectLostError: + num_evicted += 1 + # Some objects should have been evicted, and some should still be in the + # object store. 
+ assert num_evicted > 0 + assert num_success > 0 @pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py index 5a2b57e2c23d..f9c736689e61 100644 --- a/python/ray/tests/test_advanced_3.py +++ b/python/ray/tests/test_advanced_3.py @@ -344,7 +344,10 @@ def test_initialized_local_mode(shutdown_only_with_initialization_check): def test_wait_reconstruction(shutdown_only): - ray.init(num_cpus=1, object_store_memory=int(10**8)) + ray.init( + num_cpus=1, + object_store_memory=int(10**8), + _system_config={"object_pinning_enabled": 0}) @ray.remote def f(): diff --git a/python/ray/tests/test_basic_2.py b/python/ray/tests/test_basic_2.py index 21fabc4ba55a..b71c63fbf941 100644 --- a/python/ray/tests/test_basic_2.py +++ b/python/ray/tests/test_basic_2.py @@ -342,7 +342,7 @@ def g(x): @pytest.mark.skipif(client_test_enabled(), reason="message size") def test_system_config_when_connecting(ray_start_cluster): - config = {"object_timeout_milliseconds": 200} + config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200} cluster = ray.cluster_utils.Cluster() cluster.add_node( _system_config=config, object_store_memory=100 * 1024 * 1024) @@ -360,7 +360,9 @@ def test_system_config_when_connecting(ray_start_cluster): put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8)) del put_ref - ray.get(obj_ref) + # This would not raise an exception if object pinning was enabled. 
+ with pytest.raises(ray.exceptions.ObjectLostError): + ray.get(obj_ref) def test_get_multiple(ray_start_regular_shared): diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index 724033c1965c..b28ebe1ae10d 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -1120,6 +1120,56 @@ def test(self): ray.put(np.zeros(10**8 + 2, dtype=np.uint8)) +def test_fill_object_store_lru_fallback(shutdown_only): + config = { + "free_objects_batch_size": 1, + } + ray.init( + num_cpus=2, + object_store_memory=10**8, + _lru_evict=True, + _system_config=config) + + @ray.remote + def expensive_task(): + return np.zeros((10**8) // 2, dtype=np.uint8) + + # Check that objects out of scope are cleaned up quickly. + ray.get(expensive_task.remote()) + start = time.time() + for _ in range(3): + ray.get(expensive_task.remote()) + end = time.time() + assert end - start < 3 + + obj_refs = [] + for _ in range(3): + obj_ref = expensive_task.remote() + ray.get(obj_ref) + obj_refs.append(obj_ref) + + @ray.remote + class LargeMemoryActor: + def some_expensive_task(self): + return np.zeros(10**8 // 2, dtype=np.uint8) + + def test(self): + return 1 + + actor = LargeMemoryActor.remote() + for _ in range(3): + obj_ref = actor.some_expensive_task.remote() + ray.get(obj_ref) + obj_refs.append(obj_ref) + # Make sure actor does not die + ray.get(actor.test.remote()) + + for _ in range(3): + obj_ref = ray.put(np.zeros(10**8 // 2, dtype=np.uint8)) + ray.get(obj_ref) + obj_refs.append(obj_ref) + + @pytest.mark.parametrize( "ray_start_cluster", [{ "num_nodes": 1, diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 0c0f3010af13..9fcd3c25f4c4 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -245,7 +245,9 @@ def pending(input1, input2): def test_feature_flag(shutdown_only): - ray.init(object_store_memory=100 * 1024 * 1024) + ray.init( + 
object_store_memory=100 * 1024 * 1024, + _system_config={"object_pinning_enabled": 0}) @ray.remote def f(array): diff --git a/python/ray/worker.py b/python/ray/worker.py index 7239b80a982e..5ca73860ad63 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -601,6 +601,12 @@ def init( directory for the Ray process. Defaults to an OS-specific conventional location, e.g., "/tmp/ray". _java_worker_options: Overwrite the options to start Java workers. + _lru_evict (bool): If True, when an object store is full, it will evict + objects in LRU order to make more space and when under memory + pressure, ray.ObjectLostError may be thrown. If False, then + reference counting will be used to decide which objects are safe + to evict and when under memory pressure, ray.ObjectStoreFullError + may be thrown. _metrics_export_port(int): Port number Ray exposes system metrics through a Prometheus endpoint. It is currently under active development, and the API is subject to change. @@ -738,6 +744,9 @@ def init( if _system_config is not None and len(_system_config) != 0: raise ValueError("When connecting to an existing cluster, " "_system_config must not be provided.") + if _lru_evict: + raise ValueError("When connecting to an existing cluster, " + "_lru_evict must not be provided.") if _enable_object_reconstruction: raise ValueError( "When connecting to an existing cluster, " diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 3bcb1554697c..f109bbd59ea9 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -57,6 +57,10 @@ RAY_CONFIG(int64_t, debug_dump_period_milliseconds, 10000) /// type of task from starving other types (see issue #3664). RAY_CONFIG(bool, fair_queueing_enabled, true) +/// Whether to enable object pinning for plasma objects. When this is +/// enabled, objects in scope in the cluster will not be LRU evicted. 
+RAY_CONFIG(bool, object_pinning_enabled, true) + /// Whether to enable distributed reference counting for objects. When this is /// enabled, an object's ref count will include any references held by other /// processes, such as when an ObjectID is serialized and passed as an argument @@ -66,9 +70,11 @@ RAY_CONFIG(bool, fair_queueing_enabled, true) /// information: /// 1. Local Python references to the ObjectID. /// 2. Pending tasks submitted by the local process that depend on the object. -/// If both this flag is turned on, then an object +/// If both this flag and object_pinning_enabled are turned on, then an object /// will not be LRU evicted until it is out of scope in ALL processes in the -/// cluster and all objects that contain it are also out of scope. +/// cluster and all objects that contain it are also out of scope. If this flag +/// is off and object_pinning_enabled is turned on, then an object will not be +/// LRU evicted until it is out of scope on the CREATOR of the ObjectID. RAY_CONFIG(bool, distributed_ref_counting_enabled, true) /// Whether to record the creation sites of object references. This adds more @@ -76,7 +82,7 @@ RAY_CONFIG(bool, distributed_ref_counting_enabled, true) /// creating object references. RAY_CONFIG(bool, record_ref_creation_sites, true) -/// Objects that have been unpinned are +/// If object_pinning_enabled is on, then objects that have been unpinned are /// added to a local cache. When the cache is flushed, all objects in the cache /// will be eagerly evicted in a batch by freeing all plasma copies in the /// cluster. If set, then this is the duration between attempts to flush the @@ -90,7 +96,7 @@ RAY_CONFIG(bool, record_ref_creation_sites, true) /// raylet_heartbeat_period_milliseconds. RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000) -/// Objects that have been unpinned are +/// If object_pinning_enabled is on, then objects that have been unpinned are /// added to a local cache. 
When the cache is flushed, all objects in the cache /// will be eagerly evicted in a batch by freeing all plasma copies in the /// cluster. This is the maximum number of objects in the local cache before it diff --git a/src/ray/object_manager/plasma/create_request_queue.cc b/src/ray/object_manager/plasma/create_request_queue.cc index e8f45581b643..ddb9b089157d 100644 --- a/src/ray/object_manager/plasma/create_request_queue.cc +++ b/src/ray/object_manager/plasma/create_request_queue.cc @@ -81,7 +81,16 @@ std::pair CreateRequestQueue::TryRequestImmediately( } bool CreateRequestQueue::ProcessRequest(std::unique_ptr &request) { - request->error = request->create_callback(&request->result); + // TODO(sang): Delete this logic when lru evict is removed. + bool evict_if_full = evict_if_full_; + if (oom_start_time_ns_ != -1) { + // If the first attempt fails, we set the evict_if_full true. + // We need this logic because if lru_evict flag is on, this is false because we + // shouldn't evict objects in the first attempt. 
+ evict_if_full = true; + } + request->error = + request->create_callback(/*evict_if_full=*/evict_if_full, &request->result); return request->error != PlasmaError::OutOfMemory; } diff --git a/src/ray/object_manager/plasma/create_request_queue.h b/src/ray/object_manager/plasma/create_request_queue.h index d22ac292b0a8..d2ac288bdeeb 100644 --- a/src/ray/object_manager/plasma/create_request_queue.h +++ b/src/ray/object_manager/plasma/create_request_queue.h @@ -31,16 +31,22 @@ namespace plasma { class CreateRequestQueue { public: - using CreateObjectCallback = std::function; + using CreateObjectCallback = + std::function; - CreateRequestQueue(int64_t oom_grace_period_s, + CreateRequestQueue(bool evict_if_full, int64_t oom_grace_period_s, ray::SpillObjectsCallback spill_objects_callback, std::function trigger_global_gc, std::function get_time) - : oom_grace_period_ns_(oom_grace_period_s * 1e9), + : evict_if_full_(evict_if_full), + oom_grace_period_ns_(oom_grace_period_s * 1e9), spill_objects_callback_(spill_objects_callback), trigger_global_gc_(trigger_global_gc), - get_time_(get_time) {} + get_time_(get_time) { + RAY_LOG(DEBUG) << "Starting plasma::CreateRequestQueue with OOM grace period " + << oom_grace_period_ns_ << ", evict if full? " + << (evict_if_full_ ? 1 : 0); + } /// Add a request to the queue. The caller should use the returned request ID /// to later get the result of the request. @@ -145,6 +151,11 @@ class CreateRequestQueue { /// a request by retrying. Start at 1 because 0 means "do not retry". uint64_t next_req_id_ = 1; + /// On the first attempt to create an object, whether to evict from the + /// object store to make space. If the first attempt fails, then we will + /// always try to evict. + const bool evict_if_full_; + /// Grace period until we throw the OOM error to the application. /// -1 means grace period is infinite. 
const int64_t oom_grace_period_ns_; diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index 642d842047c7..920ced48e39d 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -129,6 +129,7 @@ PlasmaStore::PlasmaStore(boost::asio::io_service &main_service, std::string dire usage_log_interval_ns_(RayConfig::instance().object_store_usage_log_interval_s() * 1e9), create_request_queue_( + /*evict_if_full=*/RayConfig::instance().object_pinning_enabled(), /*oom_grace_period_s=*/RayConfig::instance().oom_grace_period_s(), spill_objects_callback, object_store_full_callback, /*get_time=*/ @@ -172,19 +173,21 @@ void PlasmaStore::AddToClientObjectIds(const ObjectID &object_id, ObjectTableEnt } // Allocate memory -uint8_t *PlasmaStore::AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_size, - ptrdiff_t *offset, +uint8_t *PlasmaStore::AllocateMemory(size_t size, bool evict_if_full, MEMFD_TYPE *fd, + int64_t *map_size, ptrdiff_t *offset, const std::shared_ptr &client, bool is_create, PlasmaError *error) { // First free up space from the client's LRU queue if quota enforcement is on. - std::vector client_objects_to_evict; - bool quota_ok = eviction_policy_.EnforcePerClientQuota(client.get(), size, is_create, - &client_objects_to_evict); - if (!quota_ok) { - *error = PlasmaError::OutOfMemory; - return nullptr; + if (evict_if_full) { + std::vector client_objects_to_evict; + bool quota_ok = eviction_policy_.EnforcePerClientQuota(client.get(), size, is_create, + &client_objects_to_evict); + if (!quota_ok) { + *error = PlasmaError::OutOfMemory; + return nullptr; + } + EvictObjects(client_objects_to_evict); } - EvictObjects(client_objects_to_evict); // Try to evict objects until there is enough space. 
uint8_t *pointer = nullptr; @@ -197,7 +200,7 @@ uint8_t *PlasmaStore::AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_s // it is not guaranteed that the corresponding pointer in the client will be // 64-byte aligned, but in practice it often will be. pointer = reinterpret_cast(PlasmaAllocator::Memalign(kBlockSize, size)); - if (pointer) { + if (pointer || !evict_if_full) { // If we manage to allocate the memory, return the pointer. If we cannot // allocate the space, but we are also not allowed to evict anything to // make more space, return an error to the client. @@ -233,6 +236,7 @@ uint8_t *PlasmaStore::AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_s PlasmaError PlasmaStore::HandleCreateObjectRequest(const std::shared_ptr &client, const std::vector &message, + bool evict_if_full, PlasmaObject *object) { uint8_t *input = (uint8_t *)message.data(); size_t input_size = message.size(); @@ -248,9 +252,9 @@ PlasmaError PlasmaStore::HandleCreateObjectRequest(const std::shared_ptr ReadCreateRequest(input, input_size, &object_id, &owner_raylet_id, &owner_ip_address, &owner_port, &owner_worker_id, &data_size, &metadata_size, &device_num); - auto error = - CreateObject(object_id, owner_raylet_id, owner_ip_address, owner_port, - owner_worker_id, data_size, metadata_size, device_num, client, object); + auto error = CreateObject(object_id, owner_raylet_id, owner_ip_address, owner_port, + owner_worker_id, evict_if_full, data_size, metadata_size, + device_num, client, object); if (error == PlasmaError::OutOfMemory) { RAY_LOG(DEBUG) << "Not enough memory to create the object " << object_id << ", data_size=" << data_size << ", metadata_size=" << metadata_size; @@ -258,13 +262,11 @@ PlasmaError PlasmaStore::HandleCreateObjectRequest(const std::shared_ptr return error; } -PlasmaError PlasmaStore::CreateObject(const ObjectID &object_id, - const NodeID &owner_raylet_id, - const std::string &owner_ip_address, int owner_port, - const WorkerID &owner_worker_id, 
int64_t data_size, - int64_t metadata_size, int device_num, - const std::shared_ptr &client, - PlasmaObject *result) { +PlasmaError PlasmaStore::CreateObject( + const ObjectID &object_id, const NodeID &owner_raylet_id, + const std::string &owner_ip_address, int owner_port, const WorkerID &owner_worker_id, + bool evict_if_full, int64_t data_size, int64_t metadata_size, int device_num, + const std::shared_ptr &client, PlasmaObject *result) { RAY_LOG(DEBUG) << "creating object " << object_id.Hex() << " size " << data_size; auto entry = GetObjectTableEntry(&store_info_, object_id); @@ -282,7 +284,8 @@ PlasmaError PlasmaStore::CreateObject(const ObjectID &object_id, if (device_num == 0) { PlasmaError error = PlasmaError::OK; - pointer = AllocateMemory(total_size, &fd, &map_size, &offset, client, true, &error); + pointer = AllocateMemory(total_size, evict_if_full, &fd, &map_size, &offset, client, + true, &error); if (!pointer) { return error; } @@ -488,9 +491,9 @@ void PlasmaStore::ProcessGetRequest(const std::shared_ptr &client, RAY_CHECK(!entry->pointer); PlasmaError error = PlasmaError::OK; - entry->pointer = - AllocateMemory(entry->data_size + entry->metadata_size, &entry->fd, - &entry->map_size, &entry->offset, client, false, &error); + entry->pointer = AllocateMemory(entry->data_size + entry->metadata_size, + /*evict=*/true, &entry->fd, &entry->map_size, + &entry->offset, client, false, &error); if (entry->pointer) { // TODO(suquark): Not sure if this old behavior is still compatible // with our current object spilling mechanics. 
@@ -862,8 +865,9 @@ Status PlasmaStore::ProcessMessage(const std::shared_ptr &client, const auto &object_id = GetCreateRequestObjectId(message); const auto &request = flatbuffers::GetRoot(input); - auto handle_create = [this, client, message](PlasmaObject *result) { - return HandleCreateObjectRequest(client, message, result); + auto handle_create = [this, client, message](bool evict_if_full, + PlasmaObject *result) { + return HandleCreateObjectRequest(client, message, evict_if_full, result); }; if (request->try_immediately()) { diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index c6561bf655b7..eedcb526d809 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -77,6 +77,10 @@ class PlasmaStore { /// \param owner_ip_address IP address of the object's owner. /// \param owner_port Port of the object's owner. /// \param owner_worker_id Worker ID of the object's owner. + /// \param evict_if_full If this is true, then when the object store is full, + /// try to evict objects that are not currently referenced before + /// creating the object. Else, do not evict any objects and + /// immediately return an PlasmaError::OutOfMemory. /// \param data_size Size in bytes of the object to be created. /// \param metadata_size Size in bytes of the object metadata. /// \param device_num The number of the device where the object is being @@ -96,8 +100,8 @@ class PlasmaStore { /// plasma_release. PlasmaError CreateObject(const ObjectID &object_id, const NodeID &owner_raylet_id, const std::string &owner_ip_address, int owner_port, - const WorkerID &owner_worker_id, int64_t data_size, - int64_t metadata_size, int device_num, + const WorkerID &owner_worker_id, bool evict_if_full, + int64_t data_size, int64_t metadata_size, int device_num, const std::shared_ptr &client, PlasmaObject *result); /// Abort a created but unsealed object. 
If the client is not the @@ -220,7 +224,7 @@ class PlasmaStore { private: PlasmaError HandleCreateObjectRequest(const std::shared_ptr &client, const std::vector &message, - PlasmaObject *object); + bool evict_if_full, PlasmaObject *object); void ReplyToCreateClient(const std::shared_ptr &client, const ObjectID &object_id, uint64_t req_id); @@ -251,9 +255,10 @@ class PlasmaStore { void EraseFromObjectTable(const ObjectID &object_id); - uint8_t *AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_size, - ptrdiff_t *offset, const std::shared_ptr &client, - bool is_create, PlasmaError *error); + uint8_t *AllocateMemory(size_t size, bool evict_if_full, MEMFD_TYPE *fd, + int64_t *map_size, ptrdiff_t *offset, + const std::shared_ptr &client, bool is_create, + PlasmaError *error); // Start listening for clients. void DoAccept(); diff --git a/src/ray/object_manager/test/create_request_queue_test.cc b/src/ray/object_manager/test/create_request_queue_test.cc index 5b107c71ad27..ec75e0043e79 100644 --- a/src/ray/object_manager/test/create_request_queue_test.cc +++ b/src/ray/object_manager/test/create_request_queue_test.cc @@ -49,6 +49,7 @@ class CreateRequestQueueTest : public ::testing::Test { : oom_grace_period_s_(1), current_time_ns_(0), queue_( + /*evict_if_full=*/true, /*oom_grace_period_s=*/oom_grace_period_s_, /*spill_object_callback=*/[&]() { return false; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, @@ -68,7 +69,7 @@ class CreateRequestQueueTest : public ::testing::Test { }; TEST_F(CreateRequestQueueTest, TestSimple) { - auto request = [&](PlasmaObject *result) { + auto request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -104,8 +105,10 @@ TEST_F(CreateRequestQueueTest, TestSimple) { } TEST_F(CreateRequestQueueTest, TestOom) { - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; - auto blocked_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool 
evict_if_full, PlasmaObject *result) { + return PlasmaError::OutOfMemory; + }; + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -138,14 +141,17 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) { int num_global_gc_ = 0; int64_t current_time_ns; CreateRequestQueue queue( + /*evict_if_full=*/true, /*oom_grace_period_s=*/100, // Spilling is failing. /*spill_object_callback=*/[&]() { return false; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, /*get_time=*/[&]() { return current_time_ns; }); - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; - auto blocked_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + return PlasmaError::OutOfMemory; + }; + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -168,19 +174,20 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) { TEST_F(CreateRequestQueueTest, TestTransientOom) { CreateRequestQueue queue( + /*evict_if_full=*/true, /*oom_grace_period_s=*/oom_grace_period_s_, /*spill_object_callback=*/[&]() { return true; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, /*get_time=*/[&]() { return current_time_ns_; }); auto return_status = PlasmaError::OutOfMemory; - auto oom_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { if (return_status == PlasmaError::OK) { result->data_size = 1234; } return return_status; }; - auto blocked_request = [&](PlasmaObject *result) { + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -213,19 +220,20 @@ TEST_F(CreateRequestQueueTest, TestTransientOom) { TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) { bool is_spilling_possible = true; CreateRequestQueue queue( + /*evict_if_full=*/true, 
/*oom_grace_period_s=*/oom_grace_period_s_, /*spill_object_callback=*/[&]() { return is_spilling_possible; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, /*get_time=*/[&]() { return current_time_ns_; }); auto return_status = PlasmaError::OutOfMemory; - auto oom_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { if (return_status == PlasmaError::OK) { result->data_size = 1234; } return return_status; }; - auto blocked_request = [&](PlasmaObject *result) { + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -263,15 +271,38 @@ TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) { AssertNoLeaks(); } +TEST_F(CreateRequestQueueTest, TestEvictIfFull) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + RAY_CHECK(evict_if_full); + return PlasmaError::OutOfMemory; + }; + + auto client = std::make_shared(); + static_cast(queue_.AddRequest(ObjectID::Nil(), client, oom_request)); + ASSERT_TRUE(queue_.ProcessRequests().IsObjectStoreFull()); + ASSERT_TRUE(queue_.ProcessRequests().IsObjectStoreFull()); +} + TEST(CreateRequestQueueParameterTest, TestNoEvictIfFull) { int64_t current_time_ns = 0; CreateRequestQueue queue( + /*evict_if_full=*/false, /*oom_grace_period_s=*/1, /*spill_object_callback=*/[&]() { return false; }, /*on_global_gc=*/[&]() {}, /*get_time=*/[&]() { return current_time_ns; }); - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; + bool first_try = true; + + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + if (first_try) { + RAY_CHECK(!evict_if_full); + first_try = false; + } else { + RAY_CHECK(evict_if_full); + } + return PlasmaError::OutOfMemory; + }; auto client = std::make_shared(); static_cast(queue.AddRequest(ObjectID::Nil(), client, oom_request)); @@ -281,7 +312,7 @@ TEST(CreateRequestQueueParameterTest, TestNoEvictIfFull) { } 
TEST_F(CreateRequestQueueTest, TestClientDisconnected) { - auto request = [&](PlasmaObject *result) { + auto request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -310,7 +341,7 @@ TEST_F(CreateRequestQueueTest, TestClientDisconnected) { } TEST_F(CreateRequestQueueTest, TestTryRequestImmediately) { - auto request = [&](PlasmaObject *result) { + auto request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -335,7 +366,9 @@ TEST_F(CreateRequestQueueTest, TestTryRequestImmediately) { // Queue is empty, but request would block. Check that we do not attempt to // retry the request. - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + return PlasmaError::OutOfMemory; + }; result = queue_.TryRequestImmediately(ObjectID::Nil(), client, oom_request); ASSERT_EQ(result.first.data_size, 0); ASSERT_EQ(result.second, PlasmaError::OutOfMemory); diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index d37576a48ede..3ee7de57c816 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -23,6 +23,7 @@ namespace raylet { void LocalObjectManager::PinObjects(const std::vector &object_ids, std::vector> &&objects, const rpc::Address &owner_address) { + RAY_CHECK(object_pinning_enabled_); for (size_t i = 0; i < object_ids.size(); i++) { const auto &object_id = object_ids[i]; auto &object = objects[i]; @@ -60,17 +61,20 @@ void LocalObjectManager::WaitForObjectFree(const rpc::Address &owner_address, } void LocalObjectManager::ReleaseFreedObject(const ObjectID &object_id) { - RAY_LOG(DEBUG) << "Unpinning object " << object_id; - // The object should be in one of these stats. pinned, spilling, or spilled. 
- RAY_CHECK((pinned_objects_.count(object_id) > 0) || - (spilled_objects_url_.count(object_id) > 0) || - (objects_pending_spill_.count(object_id) > 0)); - if (automatic_object_deletion_enabled_) { - spilled_object_pending_delete_.push(object_id); - } - if (pinned_objects_.count(object_id)) { - pinned_objects_size_ -= pinned_objects_[object_id].first->GetSize(); - pinned_objects_.erase(object_id); + // object_pinning_enabled_ flag is off when the --lru-evict flag is on. + if (object_pinning_enabled_) { + RAY_LOG(DEBUG) << "Unpinning object " << object_id; + // The object should be in one of these stats. pinned, spilling, or spilled. + RAY_CHECK((pinned_objects_.count(object_id) > 0) || + (spilled_objects_url_.count(object_id) > 0) || + (objects_pending_spill_.count(object_id) > 0)); + if (automatic_object_deletion_enabled_) { + spilled_object_pending_delete_.push(object_id); + } + if (pinned_objects_.count(object_id)) { + pinned_objects_size_ -= pinned_objects_[object_id].first->GetSize(); + pinned_objects_.erase(object_id); + } } // Try to evict all copies of the object from the cluster. @@ -89,7 +93,7 @@ void LocalObjectManager::FlushFreeObjects() { on_objects_freed_(objects_to_free_); objects_to_free_.clear(); } - if (automatic_object_deletion_enabled_) { + if (object_pinning_enabled_ && automatic_object_deletion_enabled_) { // Deletion wouldn't work when the object pinning is not enabled. 
ProcessSpilledObjectsDeleteQueue(free_objects_batch_size_); } diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index 285060ab5cd3..267edabd9d8a 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -41,7 +41,7 @@ class LocalObjectManager { const NodeID &node_id, size_t free_objects_batch_size, int64_t free_objects_period_ms, IOWorkerPoolInterface &io_worker_pool, gcs::ObjectInfoAccessor &object_info_accessor, - rpc::CoreWorkerClientPool &owner_client_pool, + rpc::CoreWorkerClientPool &owner_client_pool, bool object_pinning_enabled, bool automatic_object_deletion_enabled, int max_io_workers, int64_t min_spilling_size, bool is_external_storage_type_fs, std::function &)> on_objects_freed, @@ -54,6 +54,7 @@ class LocalObjectManager { io_worker_pool_(io_worker_pool), object_info_accessor_(object_info_accessor), owner_client_pool_(owner_client_pool), + object_pinning_enabled_(object_pinning_enabled), automatic_object_deletion_enabled_(automatic_object_deletion_enabled), on_objects_freed_(on_objects_freed), last_free_objects_at_ms_(current_time_ms()), @@ -202,6 +203,9 @@ class LocalObjectManager { /// this node. rpc::CoreWorkerClientPool &owner_client_pool_; + /// Whether to enable pinning for plasma objects. + bool object_pinning_enabled_; + /// Whether to enable automatic deletion when refs are gone out of scope. 
bool automatic_object_deletion_enabled_; diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index 729c400fe31a..1d47f23b356a 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -205,6 +205,8 @@ int main(int argc, char *argv[]) { RayConfig::instance().metrics_report_interval_ms() / 2; node_manager_config.fair_queueing_enabled = RayConfig::instance().fair_queueing_enabled(); + node_manager_config.object_pinning_enabled = + RayConfig::instance().object_pinning_enabled(); node_manager_config.automatic_object_deletion_enabled = RayConfig::instance().automatic_object_deletion_enabled(); node_manager_config.store_socket_name = store_socket_name; diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 4eb3941dd260..2287fd3e821b 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -130,6 +130,7 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self std::chrono::milliseconds(config.report_resources_period_ms)), debug_dump_period_(config.debug_dump_period_ms), fair_queueing_enabled_(config.fair_queueing_enabled), + object_pinning_enabled_(config.object_pinning_enabled), temp_dir_(config.temp_dir), object_manager_profile_timer_(io_service), initial_config_(config), @@ -161,6 +162,7 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self self_node_id_, RayConfig::instance().free_objects_batch_size(), RayConfig::instance().free_objects_period_milliseconds(), worker_pool_, gcs_client_->Objects(), worker_rpc_pool_, + /* object_pinning_enabled */ config.object_pinning_enabled, /* automatic_object_deletion_enabled */ config.automatic_object_deletion_enabled, /*max_io_workers*/ config.max_io_workers, @@ -2067,42 +2069,52 @@ void NodeManager::HandleTaskReconstruction(const TaskID &task_id, rpc::Address owner_addr; bool has_owner = dependency_manager_.GetOwnerAddress(required_object_id, &owner_addr); if (has_owner) { - RAY_LOG(DEBUG) << "Required 
object " << required_object_id - << " fetch timed out, asking owner " - << WorkerID::FromBinary(owner_addr.worker_id()); - // The owner's address exists. Poll the owner to check if the object is - // still in scope. If not, mark the object as failed. - // TODO(swang): If the owner has died, we could also mark the object as - // failed as soon as we hear about the owner's failure from the GCS, - // avoiding the raylet's reconstruction timeout. - auto client = std::unique_ptr( - new rpc::CoreWorkerClient(owner_addr, client_call_manager_)); - - rpc::GetObjectStatusRequest request; - request.set_object_id(required_object_id.Binary()); - request.set_owner_worker_id(owner_addr.worker_id()); - client->GetObjectStatus( - request, [this, required_object_id, owner_addr]( - Status status, const rpc::GetObjectStatusReply &reply) { - if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE || - reply.status() == rpc::GetObjectStatusReply::FREED) { - // The owner is gone, or the owner replied that the object has - // gone out of scope (this is an edge case in the distributed ref - // counting protocol where a borrower dies before it can notify - // the owner of another borrower), or the object value has been - // freed. Store an error in the local plasma store so that an - // exception will be thrown when the worker tries to get the - // value. - rpc::ObjectReference ref; - ref.set_object_id(required_object_id.Binary()); - ref.mutable_owner_address()->CopyFrom(owner_addr); - MarkObjectsAsFailed(ErrorType::OBJECT_UNRECONSTRUCTABLE, {ref}, JobID::Nil()); - } - // Do nothing if the owner replied that the object is available. The - // object manager will continue trying to fetch the object, and this - // handler will get triggered again if the object is still - // unavailable after another timeout. - }); + if (!RayConfig::instance().object_pinning_enabled()) { + // LRU eviction is enabled. 
The object may still be in scope, but we + // weren't able to fetch the value within the timeout, so the value has + // most likely been evicted. Mark the object as unreachable. + rpc::ObjectReference ref; + ref.set_object_id(required_object_id.Binary()); + ref.mutable_owner_address()->CopyFrom(owner_addr); + MarkObjectsAsFailed(ErrorType::OBJECT_UNRECONSTRUCTABLE, {ref}, JobID::Nil()); + } else { + RAY_LOG(DEBUG) << "Required object " << required_object_id + << " fetch timed out, asking owner " + << WorkerID::FromBinary(owner_addr.worker_id()); + // The owner's address exists. Poll the owner to check if the object is + // still in scope. If not, mark the object as failed. + // TODO(swang): If the owner has died, we could also mark the object as + // failed as soon as we hear about the owner's failure from the GCS, + // avoiding the raylet's reconstruction timeout. + auto client = std::unique_ptr( + new rpc::CoreWorkerClient(owner_addr, client_call_manager_)); + + rpc::GetObjectStatusRequest request; + request.set_object_id(required_object_id.Binary()); + request.set_owner_worker_id(owner_addr.worker_id()); + client->GetObjectStatus(request, [this, required_object_id, owner_addr]( + Status status, + const rpc::GetObjectStatusReply &reply) { + if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE || + reply.status() == rpc::GetObjectStatusReply::FREED) { + // The owner is gone, or the owner replied that the object has + // gone out of scope (this is an edge case in the distributed ref + // counting protocol where a borrower dies before it can notify + // the owner of another borrower), or the object value has been + // freed. Store an error in the local plasma store so that an + // exception will be thrown when the worker tries to get the + // value. 
+ rpc::ObjectReference ref; + ref.set_object_id(required_object_id.Binary()); + ref.mutable_owner_address()->CopyFrom(owner_addr); + MarkObjectsAsFailed(ErrorType::OBJECT_UNRECONSTRUCTABLE, {ref}, JobID::Nil()); + } + // Do nothing if the owner replied that the object is available. The + // object manager will continue trying to fetch the object, and this + // handler will get triggered again if the object is still + // unavailable after another timeout. + }); + } } else { RAY_LOG(WARNING) << "Ray cannot get the value of ObjectIDs that are generated " @@ -2404,16 +2416,18 @@ void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, for (const auto &object_id_binary : request.object_ids()) { object_ids.push_back(ObjectID::FromBinary(object_id_binary)); } - std::vector> results; - if (!GetObjectsFromPlasma(object_ids, &results)) { - RAY_LOG(WARNING) - << "Failed to get objects that should have been in the object store. These " - "objects may have been evicted while there are still references in scope."; - // TODO(suquark): Maybe "Status::ObjectNotFound" is more accurate here. - send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr); - return; + if (object_pinning_enabled_) { + std::vector> results; + if (!GetObjectsFromPlasma(object_ids, &results)) { + RAY_LOG(WARNING) + << "Failed to get objects that should have been in the object store. These " + "objects may have been evicted while there are still references in scope."; + // TODO(suquark): Maybe "Status::ObjectNotFound" is more accurate here. + send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr); + return; + } + local_object_manager_.PinObjects(object_ids, std::move(results), owner_address); } - local_object_manager_.PinObjects(object_ids, std::move(results), owner_address); // Wait for the object to be freed by the owner, which keeps the ref count. 
local_object_manager_.WaitForObjectFree(owner_address, object_ids); send_reply_callback(Status::OK(), nullptr, nullptr); diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index d0819550958a..606dc3ac6fa7 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -93,6 +93,8 @@ struct NodeManagerConfig { uint64_t debug_dump_period_ms; /// Whether to enable fair queueing between task classes in raylet. bool fair_queueing_enabled; + /// Whether to enable pinning for plasma objects. + bool object_pinning_enabled; /// Whether to enable automatic object deletion for object spilling. bool automatic_object_deletion_enabled; /// The store socket name. @@ -799,6 +801,8 @@ class NodeManager : public rpc::NodeManagerServiceHandler, int64_t debug_dump_period_; /// Whether to enable fair queueing between task classes in raylet. bool fair_queueing_enabled_; + /// Whether to enable pinning for plasma objects. + bool object_pinning_enabled_; /// Incremented each time we encounter a potential resource deadlock condition. /// This is reset to zero when the condition is cleared. 
int resource_deadlock_warned_ = 0; diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index 148ed6514631..d056928c0219 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -280,6 +280,7 @@ class LocalObjectManagerTest : public ::testing::Test { manager_node_id_(NodeID::FromRandom()), manager(manager_node_id_, free_objects_batch_size, /*free_objects_period_ms=*/1000, worker_pool, object_table, client_pool, + /*object_pinning_enabled=*/true, /*automatic_object_delete_enabled=*/true, /*max_io_workers=*/2, /*min_spilling_size=*/0, From 9633470bb29cec6f4431b6715859d40338e5d96a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 007/244] =?UTF-8?q?Revert=20"Revert=20"Revert=20"Unhandled?= =?UTF-8?q?=20exception=20handler=20based=20on=20local=20ref=20counti?= =?UTF-8?q?=E2=80=A6=20(#14113)"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit d09ba50bdf68adc59d76ccc620db2b8dbbf31d22. 
--- BUILD.bazel | 10 --- python/ray/_raylet.pyx | 25 +----- python/ray/includes/libcoreworker.pxd | 1 - python/ray/tests/test_failure.py | 46 ----------- python/ray/worker.py | 79 ++++++++++++++----- src/ray/common/ray_object.h | 8 -- src/ray/core_worker/core_worker.cc | 2 +- src/ray/core_worker/core_worker.h | 3 - .../memory_store/memory_store.cc | 29 +------ .../memory_store/memory_store.h | 9 +-- src/ray/core_worker/test/memory_store_test.cc | 66 ---------------- 11 files changed, 68 insertions(+), 210 deletions(-) delete mode 100644 src/ray/core_worker/test/memory_store_test.cc diff --git a/BUILD.bazel b/BUILD.bazel index 7dbd8fadb526..c1745e468852 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -702,16 +702,6 @@ cc_test( ], ) -cc_test( - name = "memory_store_test", - srcs = ["src/ray/core_worker/test/memory_store_test.cc"], - copts = COPTS, - deps = [ - ":core_worker_lib", - "@com_google_googletest//:gtest_main", - ], -) - cc_test( name = "direct_actor_transport_test", srcs = ["src/ray/core_worker/test/direct_actor_transport_test.cc"], diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3dda95988cd3..da00f627345e 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -724,20 +724,6 @@ cdef void delete_spilled_objects_handler( job_id=None) -cdef void unhandled_exception_handler(const CRayObject& error) nogil: - with gil: - worker = ray.worker.global_worker - data = None - metadata = None - if error.HasData(): - data = Buffer.make(error.GetData()) - if error.HasMetadata(): - metadata = Buffer.make(error.GetMetadata()).to_pybytes() - # TODO(ekl) why does passing a ObjectRef.nil() lead to shutdown errors? - object_ids = [None] - worker.raise_errors([(data, metadata)], object_ids) - - # This function introduces ~2-7us of overhead per call (i.e., it can be called # up to hundreds of thousands of times per second). 
cdef void get_py_stack(c_string* stack_out) nogil: @@ -847,7 +833,6 @@ cdef class CoreWorker: options.spill_objects = spill_objects_handler options.restore_spilled_objects = restore_spilled_objects_handler options.delete_spilled_objects = delete_spilled_objects_handler - options.unhandled_exception_handler = unhandled_exception_handler options.get_lang_stack = get_py_stack options.ref_counting_enabled = True options.is_local_mode = local_mode @@ -1458,13 +1443,9 @@ cdef class CoreWorker: object_ref.native()) def remove_object_ref_reference(self, ObjectRef object_ref): - cdef: - CObjectID c_object_id = object_ref.native() - # We need to release the gil since object destruction may call the - # unhandled exception handler. - with nogil: - CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( - c_object_id) + # Note: faster to not release GIL for short-running op. + CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( + object_ref.native()) def serialize_and_promote_object_ref(self, ObjectRef object_ref): cdef: diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 2eb5f109bf65..6114b9e7d58c 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -250,7 +250,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: (void( const c_vector[c_string]&, CWorkerType) nogil) delete_spilled_objects - (void(const CRayObject&) nogil) unhandled_exception_handler (void(c_string *stack_out) nogil) get_lang_stack c_bool ref_counting_enabled c_bool is_local_mode diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index b28ebe1ae10d..fca209743129 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -20,52 +20,6 @@ get_error_message, Semaphore) -def test_unhandled_errors(ray_start_regular): - @ray.remote - def f(): - raise ValueError() - - @ray.remote - class Actor: - def f(self): - raise ValueError() - - a = Actor.remote() - 
num_exceptions = 0 - - def interceptor(e): - nonlocal num_exceptions - num_exceptions += 1 - - # Test we report unhandled exceptions. - ray.worker._unhandled_error_handler = interceptor - x1 = f.remote() - x2 = a.f.remote() - del x1 - del x2 - wait_for_condition(lambda: num_exceptions == 2) - - # Test we don't report handled exceptions. - x1 = f.remote() - x2 = a.f.remote() - with pytest.raises(ray.exceptions.RayError) as err: # noqa - ray.get([x1, x2]) - del x1 - del x2 - time.sleep(1) - assert num_exceptions == 2, num_exceptions - - # Test suppression with env var works. - try: - os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1" - x1 = f.remote() - del x1 - time.sleep(1) - assert num_exceptions == 2, num_exceptions - finally: - del os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] - - def test_failed_task(ray_start_regular, error_pubsub): @ray.remote def throw_exception_fct1(): diff --git a/python/ray/worker.py b/python/ray/worker.py index 5ca73860ad63..00d99930cf95 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -9,6 +9,7 @@ import logging import os import redis +from six.moves import queue import sys import threading import time @@ -68,12 +69,6 @@ logger = logging.getLogger(__name__) -# Visible for testing. -def _unhandled_error_handler(e: Exception): - logger.error("Unhandled error (suppress with " - "RAY_IGNORE_UNHANDLED_ERRORS=1): {}".format(e)) - - class Worker: """A class used to define the control flow of a worker process. 
@@ -282,14 +277,6 @@ def put_object(self, value, object_ref=None): self.core_worker.put_serialized_object( serialized_value, object_ref=object_ref)) - def raise_errors(self, data_metadata_pairs, object_refs): - context = self.get_serialization_context() - out = context.deserialize_objects(data_metadata_pairs, object_refs) - if "RAY_IGNORE_UNHANDLED_ERRORS" in os.environ: - return - for e in out: - _unhandled_error_handler(e) - def deserialize_objects(self, data_metadata_pairs, object_refs): context = self.get_serialization_context() return context.deserialize_objects(data_metadata_pairs, object_refs) @@ -876,6 +863,13 @@ def custom_excepthook(type, value, tb): sys.excepthook = custom_excepthook +# The last time we raised a TaskError in this process. We use this value to +# suppress redundant error messages pushed from the workers. +last_task_error_raise_time = 0 + +# The max amount of seconds to wait before printing out an uncaught error. +UNCAUGHT_ERROR_GRACE_PERIOD = 5 + def print_logs(redis_client, threads_stopped, job_id): """Prints log messages from workers on all of the nodes. @@ -1026,7 +1020,42 @@ def color_for(data: Dict[str, str]) -> str: file=print_file) -def listen_error_messages_raylet(worker, threads_stopped): +def print_error_messages_raylet(task_error_queue, threads_stopped): + """Prints message received in the given output queue. + + This checks periodically if any un-raised errors occurred in the + background. + + Args: + task_error_queue (queue.Queue): A queue used to receive errors from the + thread that listens to Redis. + threads_stopped (threading.Event): A threading event used to signal to + the thread that it should exit. + """ + + while True: + # Exit if we received a signal that we should stop. 
+ if threads_stopped.is_set(): + return + + try: + error, t = task_error_queue.get(block=False) + except queue.Empty: + threads_stopped.wait(timeout=0.01) + continue + # Delay errors a little bit of time to attempt to suppress redundant + # messages originating from the worker. + while t + UNCAUGHT_ERROR_GRACE_PERIOD > time.time(): + threads_stopped.wait(timeout=1) + if threads_stopped.is_set(): + break + if t < last_task_error_raise_time + UNCAUGHT_ERROR_GRACE_PERIOD: + logger.debug(f"Suppressing error from worker: {error}") + else: + logger.error(f"Possible unhandled error from worker: {error}") + + +def listen_error_messages_raylet(worker, task_error_queue, threads_stopped): """Listen to error messages in the background on the driver. This runs in a separate thread on the driver and pushes (error, time) @@ -1034,6 +1063,8 @@ def listen_error_messages_raylet(worker, threads_stopped): Args: worker: The worker class that this thread belongs to. + task_error_queue (queue.Queue): A queue used to communicate with the + thread that prints the errors found by this thread. threads_stopped (threading.Event): A threading event used to signal to the thread that it should exit. """ @@ -1072,9 +1103,8 @@ def listen_error_messages_raylet(worker, threads_stopped): error_message = error_data.error_message if (error_data.type == ray_constants.TASK_PUSH_ERROR): - # TODO(ekl) remove task push errors entirely now that we have - # the separate unhandled exception handler. - pass + # Delay it a bit to see if we can suppress it + task_error_queue.put((error_message, time.time())) else: logger.warning(error_message) except (OSError, redis.exceptions.ConnectionError) as e: @@ -1237,12 +1267,19 @@ def connect(node, # temporarily using this implementation which constantly queries the # scheduler for new error messages. 
if mode == SCRIPT_MODE: + q = queue.Queue() worker.listener_thread = threading.Thread( target=listen_error_messages_raylet, name="ray_listen_error_messages", - args=(worker, worker.threads_stopped)) + args=(worker, q, worker.threads_stopped)) + worker.printer_thread = threading.Thread( + target=print_error_messages_raylet, + name="ray_print_error_messages", + args=(q, worker.threads_stopped)) worker.listener_thread.daemon = True worker.listener_thread.start() + worker.printer_thread.daemon = True + worker.printer_thread.start() if log_to_driver: global_worker_stdstream_dispatcher.add_handler( "ray_print_logs", print_to_stdstream) @@ -1295,6 +1332,8 @@ def disconnect(exiting_interpreter=False): worker.import_thread.join_import_thread() if hasattr(worker, "listener_thread"): worker.listener_thread.join() + if hasattr(worker, "printer_thread"): + worker.printer_thread.join() if hasattr(worker, "logger_thread"): worker.logger_thread.join() worker.threads_stopped.clear() @@ -1406,11 +1445,13 @@ def get(object_refs, *, timeout=None): raise ValueError("'object_refs' must either be an object ref " "or a list of object refs.") + global last_task_error_raise_time # TODO(ujvl): Consider how to allow user to retrieve the ready objects. values, debugger_breakpoint = worker.get_objects( object_refs, timeout=timeout) for i, value in enumerate(values): if isinstance(value, RayError): + last_task_error_raise_time = time.time() if isinstance(value, ray.exceptions.ObjectLostError): worker.core_worker.dump_object_store_memory_usage() if isinstance(value, RayTaskError): diff --git a/src/ray/common/ray_object.h b/src/ray/common/ray_object.h index c036550a8652..633a5d787c7e 100644 --- a/src/ray/common/ray_object.h +++ b/src/ray/common/ray_object.h @@ -92,20 +92,12 @@ class RayObject { /// large to return directly as part of a gRPC response). bool IsInPlasmaError() const; - /// Mark this object as accessed before. 
- void SetAccessed() { accessed_ = true; }; - - /// Check if this object was accessed before. - bool WasAccessed() const { return accessed_; } - private: std::shared_ptr data_; std::shared_ptr metadata_; const std::vector nested_ids_; /// Whether this class holds a data copy. bool has_data_copy_; - /// Whether this object was accessed. - bool accessed_ = false; }; } // namespace ray diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 06d12387c8ad..0180e0a7ab84 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -422,7 +422,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ return Status::OK(); }, options_.ref_counting_enabled ? reference_counter_ : nullptr, local_raylet_client_, - options_.check_signals, options_.unhandled_exception_handler)); + options_.check_signals)); auto check_node_alive_fn = [this](const NodeID &node_id) { auto node = gcs_client_->Nodes().Get(node_id); diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 47023df7b40b..2ced7a10fdb8 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -82,7 +82,6 @@ struct CoreWorkerOptions { spill_objects(nullptr), restore_spilled_objects(nullptr), delete_spilled_objects(nullptr), - unhandled_exception_handler(nullptr), get_lang_stack(nullptr), kill_main(nullptr), ref_counting_enabled(false), @@ -147,8 +146,6 @@ struct CoreWorkerOptions { /// Application-language callback to delete objects from external storage. std::function &, rpc::WorkerType)> delete_spilled_objects; - /// Function to call on error objects never retrieved. - std::function unhandled_exception_handler; /// Language worker callback to get the current call stack. std::function get_lang_stack; // Function that tries to interrupt the currently running Python thread. 
diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.cc b/src/ray/core_worker/store_provider/memory_store/memory_store.cc index 7897b6504e82..6dad1b37be72 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.cc +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.cc @@ -93,7 +93,6 @@ void GetRequest::Set(const ObjectID &object_id, std::shared_ptr objec if (is_ready_) { return; // We have already hit the number of objects to return limit. } - object->SetAccessed(); objects_.emplace(object_id, object); if (objects_.size() == num_objects_ || (abort_if_any_object_is_exception_ && object->IsException() && @@ -107,7 +106,6 @@ std::shared_ptr GetRequest::Get(const ObjectID &object_id) const { std::unique_lock lock(mutex_); auto iter = objects_.find(object_id); if (iter != objects_.end()) { - iter->second->SetAccessed(); return iter->second; } @@ -118,13 +116,11 @@ CoreWorkerMemoryStore::CoreWorkerMemoryStore( std::function store_in_plasma, std::shared_ptr counter, std::shared_ptr raylet_client, - std::function check_signals, - std::function unhandled_exception_handler) + std::function check_signals) : store_in_plasma_(store_in_plasma), ref_counter_(counter), raylet_client_(raylet_client), - check_signals_(check_signals), - unhandled_exception_handler_(unhandled_exception_handler) {} + check_signals_(check_signals) {} void CoreWorkerMemoryStore::GetAsync( const ObjectID &object_id, std::function)> callback) { @@ -140,7 +136,6 @@ void CoreWorkerMemoryStore::GetAsync( } // It's important for performance to run the callback outside the lock. 
if (ptr != nullptr) { - ptr->SetAccessed(); callback(ptr); } } @@ -151,7 +146,6 @@ std::shared_ptr CoreWorkerMemoryStore::GetOrPromoteToPlasma( auto iter = objects_.find(object_id); if (iter != objects_.end()) { auto obj = iter->second; - obj->SetAccessed(); if (obj->IsInPlasmaError()) { return nullptr; } @@ -216,8 +210,6 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ if (should_add_entry) { // If there is no existing get request, then add the `RayObject` to map. objects_.emplace(object_id, object_entry); - } else { - OnErase(object_entry); } } @@ -231,7 +223,6 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ // It's important for performance to run the callbacks outside the lock. for (const auto &cb : async_callbacks) { - object_entry->SetAccessed(); cb(object_entry); } @@ -266,7 +257,6 @@ Status CoreWorkerMemoryStore::GetImpl(const std::vector &object_ids, const auto &object_id = object_ids[i]; auto iter = objects_.find(object_id); if (iter != objects_.end()) { - iter->second->SetAccessed(); (*results)[i] = iter->second; if (remove_after_get) { // Note that we cannot remove the object_id from `objects_` now, @@ -436,7 +426,6 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i if (it->second->IsInPlasmaError()) { plasma_ids_to_delete->insert(object_id); } else { - OnErase(it->second); objects_.erase(it); } } @@ -446,11 +435,7 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i void CoreWorkerMemoryStore::Delete(const std::vector &object_ids) { absl::MutexLock lock(&mu_); for (const auto &object_id : object_ids) { - auto it = objects_.find(object_id); - if (it != objects_.end()) { - OnErase(it->second); - objects_.erase(it); - } + objects_.erase(object_id); } } @@ -466,14 +451,6 @@ bool CoreWorkerMemoryStore::Contains(const ObjectID &object_id, bool *in_plasma) return false; } -void CoreWorkerMemoryStore::OnErase(std::shared_ptr obj) { - // TODO(ekl) 
note that this doesn't warn on errors that are stored in plasma. - if (obj->IsException() && !obj->IsInPlasmaError() && !obj->WasAccessed() && - unhandled_exception_handler_ != nullptr) { - unhandled_exception_handler_(*obj); - } -} - MemoryStoreStats CoreWorkerMemoryStore::GetMemoryStoreStatisticalData() { absl::MutexLock lock(&mu_); MemoryStoreStats item; diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.h b/src/ray/core_worker/store_provider/memory_store/memory_store.h index 0ca94ef6cc02..709227f65206 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.h +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.h @@ -35,8 +35,7 @@ class CoreWorkerMemoryStore { std::function store_in_plasma = nullptr, std::shared_ptr counter = nullptr, std::shared_ptr raylet_client = nullptr, - std::function check_signals = nullptr, - std::function unhandled_exception_handler = nullptr); + std::function check_signals = nullptr); ~CoreWorkerMemoryStore(){}; /// Put an object with specified ID into object store. @@ -144,9 +143,6 @@ class CoreWorkerMemoryStore { std::vector> *results, bool abort_if_any_object_is_exception); - /// Called when an object is erased from the store. - void OnErase(std::shared_ptr obj); - /// Optional callback for putting objects into the plasma store. std::function store_in_plasma_; @@ -177,9 +173,6 @@ class CoreWorkerMemoryStore { /// Function passed in to be called to check for signals (e.g., Ctrl-C). std::function check_signals_; - - /// Function called to report unhandled exceptions. - std::function unhandled_exception_handler_; }; } // namespace ray diff --git a/src/ray/core_worker/test/memory_store_test.cc b/src/ray/core_worker/test/memory_store_test.cc deleted file mode 100644 index f4403e4a887e..000000000000 --- a/src/ray/core_worker/test/memory_store_test.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2017 The Ray Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/core_worker/store_provider/memory_store/memory_store.h" - -#include "gtest/gtest.h" -#include "ray/common/test_util.h" - -namespace ray { - -TEST(TestMemoryStore, TestReportUnhandledErrors) { - std::vector> results; - WorkerContext context(WorkerType::WORKER, WorkerID::FromRandom(), JobID::FromInt(0)); - int unhandled_count = 0; - - std::shared_ptr provider = - std::make_shared( - nullptr, nullptr, nullptr, nullptr, - [&](const RayObject &obj) { unhandled_count++; }); - RayObject obj1(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); - RayObject obj2(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); - auto id1 = ObjectID::FromRandom(); - auto id2 = ObjectID::FromRandom(); - - // Check delete without get. - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj2, id2)); - ASSERT_EQ(unhandled_count, 0); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 2); - unhandled_count = 0; - - // Check delete after get. - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj1, id2)); - provider->Get({id1}, 1, 100, context, false, &results); - provider->GetOrPromoteToPlasma(id2); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 0); - - // Check delete after async get. 
- provider->GetAsync({id2}, [](std::shared_ptr obj) {}); - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj2, id2)); - provider->GetAsync({id1}, [](std::shared_ptr obj) {}); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 0); -} - -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} From dcb27d6f4473102589ccd9e71dad20ffe6e575e5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 008/244] Revert "Release process update (#13798)" This reverts commit 16c943c83010581cdcc395fd614b4598d6385486. --- release/RELEASE_CHECKLIST.md | 3 +- release/RELEASE_PROCESS.rst | 7 ++--- release/release_logs/1.2.0/microbenchmark.txt | 28 ------------------- release/release_logs/1.2.0/notes.txt | 3 -- .../1.2.0/rllib_regression_tf.txt | 27 ------------------ .../1.2.0/rllib_regression_torch.txt | 27 ------------------ .../1.2.0/scalability/distributed.txt | 4 --- .../1.2.0/scalability/object_store.txt | 1 - .../1.2.0/scalability/single_node.txt | 5 ---- .../1.2.0/stress_tests/test_dead_actors.txt | 4 --- .../1.2.0/stress_tests/test_many_tasks.txt | 17 ----------- .../stress_tests/test_placement_group.txt | 3 -- 12 files changed, 4 insertions(+), 125 deletions(-) delete mode 100644 release/release_logs/1.2.0/microbenchmark.txt delete mode 100644 release/release_logs/1.2.0/notes.txt delete mode 100644 release/release_logs/1.2.0/rllib_regression_tf.txt delete mode 100644 release/release_logs/1.2.0/rllib_regression_torch.txt delete mode 100644 release/release_logs/1.2.0/scalability/distributed.txt delete mode 100644 release/release_logs/1.2.0/scalability/object_store.txt delete mode 100644 release/release_logs/1.2.0/scalability/single_node.txt delete mode 100644 release/release_logs/1.2.0/stress_tests/test_dead_actors.txt delete mode 100644 release/release_logs/1.2.0/stress_tests/test_many_tasks.txt 
delete mode 100644 release/release_logs/1.2.0/stress_tests/test_placement_group.txt diff --git a/release/RELEASE_CHECKLIST.md b/release/RELEASE_CHECKLIST.md index f529b38ec52a..0c742a94d19f 100644 --- a/release/RELEASE_CHECKLIST.md +++ b/release/RELEASE_CHECKLIST.md @@ -56,7 +56,6 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] Results added to `release/release_logs` - [ ] stress_tests - [ ] unit_gpu_tests -- [ ] Scalability Envelope Tests - [ ] ASAN Test - [ ] K8s Test - [ ] K8s cluster launcher test @@ -108,4 +107,4 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] PR to bump master version is merged - [ ] Release is announced internally - [ ] Release is announced externally -- [ ] Any code/doc changes made during the release process contributed back to master branch +- [ ] Any code/doc changes made during the release process contributed back to master branch \ No newline at end of file diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index f7eb6292fb49..2502a08657ca 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -316,11 +316,10 @@ to proceed with the final stages of the release! of the docs, trigger a new build of the "latest" branch in readthedocs to see if that fixes it. -7. **Update latest Docker Image:** SET THE VERSION NUMBER IN `docker/fix-docker-latest.sh`, then run the script ot update the "latest" tag +7. **Update latest Docker Image:** Message Ian Rodney to bump the "latest" tag in Dockerhub for the - ``rayproject/ray`` and ``rayproject/ray-ml`` Docker images to point to the Docker images built from the release. (Make sure there is no permission denied error, you will likely have to ask Thomas for permissions). - - Check the dockerhub to verify the update worked. 
https://hub.docker.com/repository/docker/rayproject/ray/tags?page=1&name=latest&ordering=last_updated + ``rayproject/ray`` and ``rayproject/ray-ml`` Docker images to point to the Docker images built from the release. (If you have privileges in these + docker projects, you can do this step yourself.) 8. **Send out an email announcing the release** to the engineering@anyscale.com Google group, and post a slack message in the Announcements channel of the diff --git a/release/release_logs/1.2.0/microbenchmark.txt b/release/release_logs/1.2.0/microbenchmark.txt deleted file mode 100644 index 064e8b4411d4..000000000000 --- a/release/release_logs/1.2.0/microbenchmark.txt +++ /dev/null @@ -1,28 +0,0 @@ -single client get calls per second 48106.48 +- 847.52 -single client put calls per second 42709.1 +- 84.85 -multi client put calls per second 172608.71 +- 3071.81 -single client get calls (Plasma Store) per second 10669.26 +- 286.63 -single client put calls (Plasma Store) per second 6622.51 +- 47.03 -multi client put calls (Plasma Store) per second 9804.51 +- 462.32 -single client put gigabytes per second 11.45 +- 10.79 -multi client put gigabytes per second 35.06 +- 0.26 -single client tasks sync per second 1899.11 +- 87.63 -single client tasks async per second 18599.58 +- 124.02 -multi client tasks async per second 50388.88 +- 2585.47 -1:1 actor calls sync per second 3053.21 +- 60.37 -1:1 actor calls async per second 7768.59 +- 268.78 -1:1 actor calls concurrent per second 7106.24 +- 219.87 -1:n actor calls async per second 17132.11 +- 881.8 -n:n actor calls async per second 51037.11 +- 1732.95 -n:n actor calls with arg async per second 13746.19 +- 171.94 -1:1 async-actor calls sync per second 2103.39 +- 52.51 -1:1 async-actor calls async per second 4100.13 +- 53.6 -1:1 async-actor calls with args async per second 3085.78 +- 165.8 -1:n async-actor calls async per second 13906.28 +- 363.9 -n:n async-actor calls async per second 40269.65 +- 1113.55 -client: get calls per 
second 2414.77 +- 43.07 -client: put calls per second 1346.13 +- 8.2 -client: remote put calls per second 58855.54 +- 849.21 -client: 1:1 actor calls sync per second 730.58 +- 11.66 -client: 1:1 actor calls async per second 774.79 +- 14.1 -client: 1:1 actor calls concurrent per second 805.73 +- 11.46 \ No newline at end of file diff --git a/release/release_logs/1.2.0/notes.txt b/release/release_logs/1.2.0/notes.txt deleted file mode 100644 index 91c693f445a4..000000000000 --- a/release/release_logs/1.2.0/notes.txt +++ /dev/null @@ -1,3 +0,0 @@ -The test.pypi.org wheel does not match the release wheel because there was #14062 was discovered during the sanity check. - -Wheels were re-sanity checked by pip installing from s3. diff --git a/release/release_logs/1.2.0/rllib_regression_tf.txt b/release/release_logs/1.2.0/rllib_regression_tf.txt deleted file mode 100644 index 8760b66ffb64..000000000000 --- a/release/release_logs/1.2.0/rllib_regression_tf.txt +++ /dev/null @@ -1,27 +0,0 @@ -== Status == -Memory usage on this node: 8.8/480.3 GiB -Using FIFO scheduling algorithm. 
-Resources requested: 0/64 CPUs, 0.0/8 GPUs, 0.0/325.83 GiB heap, 0.0/99.07 GiB objects (0/1.0 accelerator_type:V100) -Result logdir: /home/ray/ray_results/a2c-tf-atari -Result logdir: /home/ray/ray_results/apex-dqn-tf-atari -Result logdir: /home/ray/ray_results/dqn-tf-atari -Result logdir: /home/ray/ray_results/impala-tf-atari -Result logdir: /home/ray/ray_results/ppo-tf-atari -Result logdir: /home/ray/ray_results/sac-tf-halfcheetah-pybullet -Number of trials: 12/12 (12 TERMINATED) -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ -| Trial name | status | loc | iter | total time (s) | ts | reward | episode_reward_max | episode_reward_min | episode_len_mean | -|-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------| -| A2C_BreakoutNoFrameskip-v4_e6509_00000 | TERMINATED | | 355 | 3604.01 | 4137500 | 1.86 | 10 | 0 | 815.78 | -| A2C_BreakoutNoFrameskip-v4_e6509_00001 | TERMINATED | | 354 | 3601.32 | 4067500 | 1.79 | 10 | 0 | 803.07 | -| APEX_BreakoutNoFrameskip-v4_e6509_00002 | TERMINATED | | 98 | 3626.91 | 7297440 | 1.4 | 9 | 0 | 739.886 | -| APEX_BreakoutNoFrameskip-v4_e6509_00003 | TERMINATED | | 97 | 3607.18 | 7222240 | 1.17816 | 5 | 0 | 702.362 | -| DQN_BreakoutNoFrameskip-v4_e6509_00004 | TERMINATED | | 35 | 3636.53 | 360000 | 1.25 | 6 | 0 | 710.49 | -| DQN_BreakoutNoFrameskip-v4_e6509_00005 | TERMINATED | | 35 | 3631.05 | 360000 | 1.36 | 9 | 0 | 723.54 | -| IMPALA_BreakoutNoFrameskip-v4_e6509_00006 | TERMINATED | | 350 | 3607.49 | 3024500 | 1.87 | 9 | 0 | 816.3 | -| IMPALA_BreakoutNoFrameskip-v4_e6509_00007 | TERMINATED | | 349 | 3601.95 | 3025500 | 1.21 | 6 | 0 | 716.7 | -| PPO_BreakoutNoFrameskip-v4_e6509_00008 | TERMINATED | | 1858 | 3600.41 | 9290000 | 1.69 | 10 | 0 | 792.13 | -| 
PPO_BreakoutNoFrameskip-v4_e6509_00009 | TERMINATED | | 1851 | 3601.2 | 9255000 | 1.6 | 11 | 0 | 770.95 | -| SAC_HalfCheetahBulletEnv-v0_e6509_00010 | TERMINATED | | 45 | 3670.33 | 54000 | 269.06 | 622.238 | -454.818 | 1000 | -| SAC_HalfCheetahBulletEnv-v0_e6509_00011 | TERMINATED | | 45 | 3654.38 | 54000 | 473.166 | 628.875 | 156.264 | 1000 | -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ diff --git a/release/release_logs/1.2.0/rllib_regression_torch.txt b/release/release_logs/1.2.0/rllib_regression_torch.txt deleted file mode 100644 index 11309f5e3c68..000000000000 --- a/release/release_logs/1.2.0/rllib_regression_torch.txt +++ /dev/null @@ -1,27 +0,0 @@ -== Status == -Memory usage on this node: 8.6/480.3 GiB -Using FIFO scheduling algorithm. -Resources requested: 0/64 CPUs, 0.0/8 GPUs, 0.0/325.73 GiB heap, 0.0/99.07 GiB objects (0/1.0 accelerator_type:V100) -Result logdir: /home/ray/ray_results/a2c-torch-atari -Result logdir: /home/ray/ray_results/apex-dqn-torch-atari -Result logdir: /home/ray/ray_results/dqn-torch-atari -Result logdir: /home/ray/ray_results/impala-torch-atari -Result logdir: /home/ray/ray_results/ppo-torch-atari -Result logdir: /home/ray/ray_results/sac-torch-halfcheetah-pybullet -Number of trials: 12/12 (12 TERMINATED) -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ -| Trial name | status | loc | iter | total time (s) | ts | reward | episode_reward_max | episode_reward_min | episode_len_mean | -|-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------| -| A2C_BreakoutNoFrameskip-v4_a6f57_00000 | TERMINATED | | 353 | 3603.76 | 3378500 | 1.93 | 15 | 0 | 
821.58 | -| A2C_BreakoutNoFrameskip-v4_a6f57_00001 | TERMINATED | | 353 | 3608.48 | 3404500 | 1.15 | 6 | 0 | 701.51 | -| APEX_BreakoutNoFrameskip-v4_a6f57_00002 | TERMINATED | | 113 | 3615.57 | 5680160 | 1.6381 | 9 | 0 | 773.381 | -| APEX_BreakoutNoFrameskip-v4_a6f57_00003 | TERMINATED | | 114 | 3636.38 | 5764800 | 1.39655 | 6 | 0 | 735.914 | -| DQN_BreakoutNoFrameskip-v4_a6f57_00004 | TERMINATED | | 27 | 3684.72 | 280000 | 1.79 | 12 | 0 | 743.6 | -| DQN_BreakoutNoFrameskip-v4_a6f57_00005 | TERMINATED | | 27 | 3685.26 | 280000 | 1.14 | 5 | 0 | 699.19 | -| IMPALA_BreakoutNoFrameskip-v4_a6f57_00006 | TERMINATED | | 356 | 3606.67 | 7850250 | 1.7803 | 12 | 0 | 795.455 | -| IMPALA_BreakoutNoFrameskip-v4_a6f57_00007 | TERMINATED | | 355 | 3609.98 | 7903500 | 1.68217 | 8 | 0 | 796.659 | -| PPO_BreakoutNoFrameskip-v4_a6f57_00008 | TERMINATED | | 1401 | 3601.51 | 7005000 | 2.61 | 10 | 0 | 897.83 | -| PPO_BreakoutNoFrameskip-v4_a6f57_00009 | TERMINATED | | 1406 | 3600.35 | 7030000 | 1.47 | 11 | 0 | 647.8 | -| SAC_HalfCheetahBulletEnv-v0_a6f57_00010 | TERMINATED | | 37 | 3686.44 | 46000 | 641.43 | 723.144 | 504.62 | 1000 | -| SAC_HalfCheetahBulletEnv-v0_a6f57_00011 | TERMINATED | | 37 | 3645.16 | 46000 | 631.65 | 664.021 | 599.864 | 1000 | -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ diff --git a/release/release_logs/1.2.0/scalability/distributed.txt b/release/release_logs/1.2.0/scalability/distributed.txt deleted file mode 100644 index 860875201cea..000000000000 --- a/release/release_logs/1.2.0/scalability/distributed.txt +++ /dev/null @@ -1,4 +0,0 @@ -Actor time: 34.21903751100001 (10000 actors) │ -Task time: 386.82114117900005 (10000 tasks) │ -PG time: 31.368525181999985 (1000 placement groups) │ -Node launch time: 756.3447095859999 (250 nodes) \ No newline at end of file diff --git 
a/release/release_logs/1.2.0/scalability/object_store.txt b/release/release_logs/1.2.0/scalability/object_store.txt deleted file mode 100644 index 0471a93ba429..000000000000 --- a/release/release_logs/1.2.0/scalability/object_store.txt +++ /dev/null @@ -1 +0,0 @@ -Broadcast time: 135.75278311699998 (1073741824 B x 50 nodes) diff --git a/release/release_logs/1.2.0/scalability/single_node.txt b/release/release_logs/1.2.0/scalability/single_node.txt deleted file mode 100644 index 7a100e3eae98..000000000000 --- a/release/release_logs/1.2.0/scalability/single_node.txt +++ /dev/null @@ -1,5 +0,0 @@ -Many args time: 11.433474627000002 (10000 args) -Many returns time: 4.487700554 (3000 returns) -Ray.get time: 21.957432587999996 (10000 args) -Queued task time: 124.148238013 (1000000 tasks) -Ray.get large object time: 35.118229127000006 (107374182400 bytes) \ No newline at end of file diff --git a/release/release_logs/1.2.0/stress_tests/test_dead_actors.txt b/release/release_logs/1.2.0/stress_tests/test_dead_actors.txt deleted file mode 100644 index 2e73606f2328..000000000000 --- a/release/release_logs/1.2.0/stress_tests/test_dead_actors.txt +++ /dev/null @@ -1,4 +0,0 @@ -Finished in: 133.60612034797668s -Average iteration time: 1.3360581374168397s -Max iteration time: 5.137001276016235s -Min iteration time: 0.15551400184631348s diff --git a/release/release_logs/1.2.0/stress_tests/test_many_tasks.txt b/release/release_logs/1.2.0/stress_tests/test_many_tasks.txt deleted file mode 100644 index ffc9bc3cd483..000000000000 --- a/release/release_logs/1.2.0/stress_tests/test_many_tasks.txt +++ /dev/null @@ -1,17 +0,0 @@ -Stage 0 results: - Total time: 50.40076494216919 -Stage 1 results: - Total time: 191.78780102729797 - Average iteration time: 19.178766775131226 - Max iteration time: 21.238199949264526 - Min iteration time: 18.299438953399658 -Stage 2 results: - Total time: 280.4905333518982 - Average iteration time: 56.0978446483612 - Max iteration time: 56.96464133262634 - Min 
iteration time: 53.859785318374634 -Stage 3 results: - Actor creation time: 0.3304018974304199 - Total time: 2303.117142677307 -Stage 4 results: - Scheduling spread: 66.90121385927009. \ No newline at end of file diff --git a/release/release_logs/1.2.0/stress_tests/test_placement_group.txt b/release/release_logs/1.2.0/stress_tests/test_placement_group.txt deleted file mode 100644 index 62f8a7b74786..000000000000 --- a/release/release_logs/1.2.0/stress_tests/test_placement_group.txt +++ /dev/null @@ -1,3 +0,0 @@ -Avg placement group creating time: 0.2691924729741867 ms -Avg placement group removing time: 0.8786630945927776 ms -Stress Test succeed. \ No newline at end of file From f10481d6f4236c0aa8af0b3e31fc7bbf0949cfc1 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 009/244] Revert "skip test_basic_reconstruction_actor_task on win (#14110)" This reverts commit b0c0b6c27b2679a0221ccd8403be3b439005c7d9. --- python/ray/tests/test_reconstruction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index bad48419f58e..1589f77d8332 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -220,7 +220,6 @@ def dependent_task(x): pass -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_basic_reconstruction_actor_task(ray_start_cluster, reconstruction_enabled): From a986d6ea30ce25b518c05543e49b3250c22e1499 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 010/244] Revert "[Serve] Add support for variable routes (#13968)" This reverts commit 017c14dfba9897f99cde286205c491a7b4f78fa5. 
--- doc/source/serve/advanced.rst | 33 ------ python/ray/serve/controller.py | 11 +- python/ray/serve/endpoint_state.py | 2 +- python/ray/serve/http_proxy.py | 168 +++++++++++++---------------- python/ray/serve/http_util.py | 11 +- python/ray/serve/tests/test_api.py | 23 ---- 6 files changed, 83 insertions(+), 165 deletions(-) diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst index ca9b8e9cebf2..7a6027ad54c3 100644 --- a/doc/source/serve/advanced.rst +++ b/doc/source/serve/advanced.rst @@ -421,36 +421,3 @@ in :mod:`serve.start `: Using the "EveryNode" option, you can point a cloud load balancer to the instance group of Ray cluster to achieve high availability of Serve's HTTP proxies. - -Variable HTTP Routes -==================== - -Ray Serve supports capturing path parameters. For example, in a call of the form - -.. code-block:: python - - client.create_endpoint("my_endpoint", backend="my_backend", route="/api/{username}") - -the ``username`` parameter will be accessible in your backend code as follows: - -.. code-block:: python - - def my_backend(request): - username = request.path_params["username"] - ... - -Ray Serve uses Starlette's Router class under the hood for routing, so type -conversion for path parameters is also supported, as well as multiple path parameters. -For example, suppose this route is used: - -.. code-block:: python - - client.create_endpoint( - "complex", backend="f", route="/api/{user_id:int}/{number:float}") - -Then for a query to the route ``/api/123/3.14``, the ``request.path_params`` dictionary -available in the backend will be ``{"user_id": 123, "number": 3.14}``, where ``123`` is -a Python int and ``3.14`` is a Python float. - -For full details on the supported path parameters, see Starlette's -`path parameters documentation `_. 
diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index 8996c342dab7..0ad444a54b36 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -163,13 +163,10 @@ async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag, self.endpoint_state.shadow_traffic(endpoint_name, backend_tag, proportion) - async def create_endpoint( - self, - endpoint: str, - traffic_dict: Dict[str, float], - route: Optional[str], - methods: List[str], - ) -> None: + # TODO(architkulkarni): add Optional for route after cloudpickle upgrade + async def create_endpoint(self, endpoint: str, + traffic_dict: Dict[str, float], route, + methods: List[str]) -> None: """Create a new endpoint with the specified route and methods. If the route is None, this is a "headless" endpoint that will not diff --git a/python/ray/serve/endpoint_state.py b/python/ray/serve/endpoint_state.py index 39a67d090c86..bdbfe2c39351 100644 --- a/python/ray/serve/endpoint_state.py +++ b/python/ray/serve/endpoint_state.py @@ -20,7 +20,7 @@ def __init__(self, kv_store: RayInternalKVStore, long_poll_host: LongPollHost): self._kv_store = kv_store self._long_poll_host = long_poll_host - self._routes: Dict[str, Tuple[EndpointTag, Any]] = dict() + self._routes: Dict[BackendTag, Tuple[EndpointTag, Any]] = dict() self._traffic_policies: Dict[EndpointTag, TrafficPolicy] = dict() checkpoint = self._kv_store.get(CHECKPOINT_KEY) diff --git a/python/ray/serve/http_proxy.py b/python/ray/serve/http_proxy.py index f6fa25bb3df6..5f722276e7ca 100644 --- a/python/ray/serve/http_proxy.py +++ b/python/ray/serve/http_proxy.py @@ -1,82 +1,23 @@ import asyncio import socket -from typing import List, Dict, Tuple +from typing import List import uvicorn import starlette.responses -import starlette.routing import ray from ray.exceptions import RayTaskError -from ray.serve.common import EndpointTag from ray.serve.constants import LongPollKey from ray.util import metrics from 
ray.serve.utils import _get_logger from ray.serve.http_util import Response, build_starlette_request from ray.serve.long_poll import LongPollAsyncClient +from ray.serve.router import Router from ray.serve.handle import DEFAULT logger = _get_logger() -class ServeStarletteEndpoint: - """Wraps the given Serve endpoint in a Starlette endpoint. - - Implements the ASGI protocol. Constructs a Starlette endpoint for use by - a Starlette app or Starlette Router which calls the given Serve endpoint - using the given Serve client. - - Usage: - route = starlette.routing.Route( - "/api", - ServeStarletteEndpoint(self.client, endpoint_tag), - methods=methods) - app = starlette.applications.Starlette(routes=[route]) - """ - - def __init__(self, client, endpoint_tag: EndpointTag): - self.client = client - self.endpoint_tag = endpoint_tag - self.handle = None - - async def __call__(self, scope, receive, send): - http_body_bytes = await self.receive_http_body(scope, receive, send) - - headers = {k.decode(): v.decode() for k, v in scope["headers"]} - if self.handle is None: - self.handle = self.client.get_handle(self.endpoint_tag, sync=False) - self.handle = self.handle.options( - method_name=headers.get("X-SERVE-CALL-METHOD".lower(), - DEFAULT.VALUE), - shard_key=headers.get("X-SERVE-SHARD-KEY".lower(), DEFAULT.VALUE), - http_method=scope["method"].upper(), - http_headers=headers) - request = build_starlette_request(scope, http_body_bytes) - object_ref = await self.handle.remote(request) - result = await object_ref - - if isinstance(result, RayTaskError): - error_message = "Task Error. 
Traceback: {}.".format(result) - await Response( - error_message, status_code=500).send(scope, receive, send) - elif isinstance(result, starlette.responses.Response): - await result(scope, receive, send) - else: - await Response(result).send(scope, receive, send) - - async def receive_http_body(self, scope, receive, send): - body_buffer = [] - more_body = True - while more_body: - message = await receive() - assert message["type"] == "http.request" - - more_body = message["more_body"] - body_buffer.append(message["body"]) - - return b"".join(body_buffer) - - class HTTPProxy: """This class is meant to be instantiated and run by an ASGI HTTP server. @@ -92,12 +33,8 @@ def __init__(self, controller_name): self.client = ray.serve.connect() controller = ray.get_actor(controller_name) - - self.router = starlette.routing.Router(default=self._not_found) - - # route -> (endpoint_tag, methods). Updated via long polling. - self.route_table: Dict[str, Tuple[EndpointTag, List[str]]] = {} - + self.route_table = {} # Should be updated via long polling. 
+ self.router = Router(controller) self.long_poll_client = LongPollAsyncClient(controller, { LongPollKey.ROUTE_TABLE: self._update_route_table, }) @@ -107,38 +44,40 @@ def __init__(self, controller_name): description="The number of HTTP requests processed.", tag_keys=("route", )) + async def setup(self): + await self.router.setup_in_async_loop() + async def _update_route_table(self, route_table): logger.debug(f"HTTP Proxy: Get updated route table: {route_table}.") self.route_table = route_table - routes = [ - starlette.routing.Route( - route, - ServeStarletteEndpoint(self.client, endpoint_tag), - methods=methods) - for route, (endpoint_tag, methods) in route_table.items() - if not self._is_headless(route) - ] + async def receive_http_body(self, scope, receive, send): + body_buffer = [] + more_body = True + while more_body: + message = await receive() + assert message["type"] == "http.request" - routes.append( - starlette.routing.Route("/-/routes", self._display_route_table)) + more_body = message["more_body"] + body_buffer.append(message["body"]) - self.router.routes = routes + return b"".join(body_buffer) - async def _not_found(self, scope, receive, send): - current_path = scope["path"] - error_message = ("Path {} not found. " - "Please ping http://.../-/routes for route table." 
- ).format(current_path) - response = Response(error_message, status_code=404) - await response.send(scope, receive, send) + def _make_error_sender(self, scope, receive, send): + async def sender(error_message, status_code): + response = Response(error_message, status_code=status_code) + await response.send(scope, receive, send) - async def _display_route_table(self, request): - return starlette.responses.JSONResponse(self.route_table) + return sender - def _is_headless(self, route: str): - """Returns True if `route` corresponds to a headless endpoint.""" - return not route.startswith("/") + async def _handle_system_request(self, scope, receive, send): + current_path = scope["path"] + if current_path == "/-/routes": + await Response(self.route_table).send(scope, receive, send) + else: + await Response( + "System path {} not found".format(current_path), + status_code=404).send(scope, receive, send) async def __call__(self, scope, receive, send): """Implements the ASGI protocol. @@ -147,6 +86,8 @@ async def __call__(self, scope, receive, send): https://asgi.readthedocs.io/en/latest/specs/index.html. """ + error_sender = self._make_error_sender(scope, receive, send) + assert self.route_table is not None, ( "Route table must be set via set_route_table.") assert scope["type"] == "http" @@ -154,7 +95,51 @@ async def __call__(self, scope, receive, send): self.request_counter.record(1, tags={"route": current_path}) - await self.router(scope, receive, send) + if current_path.startswith("/-/"): + await self._handle_system_request(scope, receive, send) + return + + try: + endpoint_name, methods_allowed = self.route_table[current_path] + except KeyError: + error_message = ( + "Path {} not found. " + "Please ping http://.../-/routes for routing table" + ).format(current_path) + await error_sender(error_message, 404) + return + + if scope["method"] not in methods_allowed: + error_message = ("Methods {} not allowed. 
" + "Available HTTP methods are {}.").format( + scope["method"], methods_allowed) + await error_sender(error_message, 405) + return + + http_body_bytes = await self.receive_http_body(scope, receive, send) + + headers = {k.decode(): v.decode() for k, v in scope["headers"]} + + handle = self.client.get_handle( + endpoint_name, sync=False).options( + method_name=headers.get("X-SERVE-CALL-METHOD".lower(), + DEFAULT.VALUE), + shard_key=headers.get("X-SERVE-SHARD-KEY".lower(), + DEFAULT.VALUE), + http_method=scope["method"].upper(), + http_headers=headers) + + request = build_starlette_request(scope, http_body_bytes) + object_ref = await handle.remote(request) + result = await object_ref + + if isinstance(result, RayTaskError): + error_message = "Task Error. Traceback: {}.".format(result) + await error_sender(error_message, 500) + elif isinstance(result, starlette.responses.Response): + await result(scope, receive, send) + else: + await Response(result).send(scope, receive, send) @ray.remote @@ -172,6 +157,7 @@ async def __init__( self.setup_complete = asyncio.Event() self.app = HTTPProxy(controller_name) + await self.app.setup() self.wrapped_app = self.app for middleware in http_middlewares: diff --git a/python/ray/serve/http_util.py b/python/ray/serve/http_util.py index e8a51adf3d52..0aa4ccf84604 100644 --- a/python/ray/serve/http_util.py +++ b/python/ray/serve/http_util.py @@ -19,16 +19,7 @@ async def mock_receive(): "more_body": False } - # scope["router"] and scope["endpoint"] contain references to a router and - # endpoint object, respectively, which each in turn contain a reference to - # the Serve client, which cannot be serialized. - # The solution is to delete these from scope, as they will not be used. - # Per ASGI recommendation, copy scope before passing to child. 
- child_scope = scope.copy() - del child_scope["router"] - del child_scope["endpoint"] - - return starlette.requests.Request(child_scope, mock_receive) + return starlette.requests.Request(scope, mock_receive) class Response: diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index abfdbf1fb25a..62f239f78782 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -989,29 +989,6 @@ async def echo_body(starlette_request): assert resp == long_string -def test_variable_routes(serve_instance): - client = serve_instance - - def f(starlette_request): - return starlette_request.path_params - - client.create_backend("f", f) - client.create_endpoint("basic", backend="f", route="/api/{username}") - - # Test multiple variables and test type conversion - client.create_endpoint( - "complex", backend="f", route="/api/{user_id:int}/{number:float}") - - assert requests.get("http://127.0.0.1:8000/api/scaly").json() == { - "username": "scaly" - } - - assert requests.get("http://127.0.0.1:8000/api/23/12.345").json() == { - "user_id": 23, - "number": 12.345 - } - - if __name__ == "__main__": import sys sys.exit(pytest.main(["-v", "-s", __file__])) From bbc67868ec9c181b93b56a2bf0fdd62f6449b567 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 011/244] Revert "[tune] enable more tests (#13969)" This reverts commit 266bd8446f17f85261f183c527623a508ac1b134. 
--- python/ray/tune/BUILD | 2 +- .../test_convergence_gaussian_process.py | 25 +++----- python/ray/tune/tests/test_function_api.py | 57 ++++++++----------- 3 files changed, 34 insertions(+), 50 deletions(-) diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index 52e6d0ed116b..b013dc4e4751 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -87,7 +87,7 @@ py_test( py_test( name = "test_function_api", - size = "medium", + size = "small", srcs = ["tests/test_function_api.py"], deps = [":tune_lib"], tags = ["exclusive"], diff --git a/python/ray/tune/tests/test_convergence_gaussian_process.py b/python/ray/tune/tests/test_convergence_gaussian_process.py index c0abecdd3aef..c81eff8ef6e7 100644 --- a/python/ray/tune/tests/test_convergence_gaussian_process.py +++ b/python/ray/tune/tests/test_convergence_gaussian_process.py @@ -1,4 +1,3 @@ -import math import numpy as np import ray @@ -16,41 +15,33 @@ def loss(config, reporter): class ConvergenceTest(unittest.TestCase): """Test convergence in gaussian process.""" - def shutDown(self): - ray.shutdown() - def test_convergence_gaussian_process(self): np.random.seed(0) ray.init(local_mode=True, num_cpus=1, num_gpus=1) - # This is the space of parameters to explore - space = {"x": tune.uniform(0, 20)} + space = { + "x": (0, 20) # This is the space of parameters to explore + } resources_per_trial = {"cpu": 1, "gpu": 0} # Following bayesian optimization - gp = BayesOptSearch(random_search_steps=10) + gp = BayesOptSearch( + space, metric="loss", mode="min", random_search_steps=10) gp.repeat_float_precision = 5 gp = ConcurrencyLimiter(gp, 1) # Execution of the BO. 
analysis = tune.run( loss, - metric="loss", - mode="min", # stop=EarlyStopping("loss", mode="min", patience=5), search_alg=gp, - config=space, + config={}, num_samples=100, # Number of iterations resources_per_trial=resources_per_trial, raise_on_failed_trial=False, fail_fast=True, verbose=1) - assert len(analysis.trials) in {13, 43} # it is 43 on the cluster? - assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-8) + assert len(analysis.trials) == 41 - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) + ray.shutdown() diff --git a/python/ray/tune/tests/test_function_api.py b/python/ray/tune/tests/test_function_api.py index e18ee35e07cc..f7084a1fac2c 100644 --- a/python/ray/tune/tests/test_function_api.py +++ b/python/ray/tune/tests/test_function_api.py @@ -6,6 +6,7 @@ import unittest import ray +import ray.cloudpickle as cloudpickle from ray.rllib import _register_all from ray import tune @@ -229,7 +230,7 @@ def train(config, checkpoint_dir=None): new_trainable2 = wrapped(logger_creator=self.logger_creator) new_trainable2.restore(checkpoint) result = new_trainable2.train() - self.assertEqual(result[TRAINING_ITERATION], 1) + self.assertEquals(result[TRAINING_ITERATION], 1) checkpoint = new_trainable2.save() new_trainable2.stop() @@ -404,15 +405,14 @@ def train(config, checkpoint_dir=None): def testEnabled(self): def train(config, checkpoint_dir=None): is_active = tune.is_session_enabled() - result = {"active": is_active} if is_active: - tune.report(**result) - return result + tune.report(active=is_active) + return is_active - assert train({})["active"] is False + assert train({}) is False analysis = tune.run(train) t = analysis.trials[0] - assert t.last_result["active"], t.last_result + assert t.last_result["active"] def testBlankCheckpoint(self): def train(config, checkpoint_dir=None): @@ -450,11 +450,11 @@ def train(config, data=None): trial_1, trial_2 = tune.run( with_parameters(train, data=data), 
num_samples=2).trials - self.assertEqual(data.data[101], 0) - self.assertEqual(trial_1.last_result["metric"], 500_000) - self.assertEqual(trial_1.last_result["hundred"], 1) - self.assertEqual(trial_2.last_result["metric"], 500_000) - self.assertEqual(trial_2.last_result["hundred"], 1) + self.assertEquals(data.data[101], 0) + self.assertEquals(trial_1.last_result["metric"], 500_000) + self.assertEquals(trial_1.last_result["hundred"], 1) + self.assertEquals(trial_2.last_result["metric"], 500_000) + self.assertEquals(trial_2.last_result["hundred"], 1) self.assertTrue(str(trial_1).startswith("train_")) # With checkpoint dir parameter @@ -465,11 +465,11 @@ def train(config, checkpoint_dir="DIR", data=None): trial_1, trial_2 = tune.run( with_parameters(train, data=data), num_samples=2).trials - self.assertEqual(data.data[101], 0) - self.assertEqual(trial_1.last_result["metric"], 500_000) - self.assertEqual(trial_1.last_result["cp"], "DIR") - self.assertEqual(trial_2.last_result["metric"], 500_000) - self.assertEqual(trial_2.last_result["cp"], "DIR") + self.assertEquals(data.data[101], 0) + self.assertEquals(trial_1.last_result["metric"], 500_000) + self.assertEquals(trial_1.last_result["cp"], "DIR") + self.assertEquals(trial_2.last_result["metric"], 500_000) + self.assertEquals(trial_2.last_result["cp"], "DIR") self.assertTrue(str(trial_1).startswith("train_")) def testWithParameters2(self): @@ -482,9 +482,7 @@ def train(config, data=None): tune.report(metric=len(data.data)) trainable = tune.with_parameters(train, data=Data()) - # ray.cloudpickle will crash for some reason - import cloudpickle as cp - dumped = cp.dumps(trainable) + dumped = cloudpickle.dumps(trainable) assert sys.getsizeof(dumped) < 100 * 1024 def testReturnAnonymous(self): @@ -496,8 +494,8 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result[DEFAULT_METRIC], 4) - self.assertEqual(trial_2.last_result[DEFAULT_METRIC], 8) + 
self.assertEquals(trial_1.last_result[DEFAULT_METRIC], 4) + self.assertEquals(trial_2.last_result[DEFAULT_METRIC], 8) def testReturnSpecific(self): def train(config): @@ -508,8 +506,8 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result["m"], 4) - self.assertEqual(trial_2.last_result["m"], 8) + self.assertEquals(trial_1.last_result["m"], 4) + self.assertEquals(trial_2.last_result["m"], 8) def testYieldAnonymous(self): def train(config): @@ -521,8 +519,8 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result[DEFAULT_METRIC], 4 + 9) - self.assertEqual(trial_2.last_result[DEFAULT_METRIC], 8 + 9) + self.assertEquals(trial_1.last_result[DEFAULT_METRIC], 4 + 9) + self.assertEquals(trial_2.last_result[DEFAULT_METRIC], 8 + 9) def testYieldSpecific(self): def train(config): @@ -534,10 +532,5 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result["m"], 4 + 9) - self.assertEqual(trial_2.last_result["m"], 8 + 9) - - -if __name__ == "__main__": - import pytest - sys.exit(pytest.main(["-v", __file__])) + self.assertEquals(trial_1.last_result["m"], 4 + 9) + self.assertEquals(trial_2.last_result["m"], 8 + 9) From c29de666f5c9dc0d76600d837148f0de87e179b6 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 012/244] Revert "[Serve] [Doc] Add version warning (#14001)" This reverts commit 2cc2adb12889ac224baee23a2412904c83bb3bba. --- doc/source/serve/index.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/source/serve/index.rst b/doc/source/serve/index.rst index d5c6853dfc13..f15093b6c0cb 100644 --- a/doc/source/serve/index.rst +++ b/doc/source/serve/index.rst @@ -1,7 +1,3 @@ -.. warning:: - Ray Serve is changing fast! 
You're probably running the latest pip release and not the nightly build, so please ensure you're viewing the correct version of this documentation. - `Here's the documentation for the latest pip release of Ray Serve `_. - .. _rayserve: ============================================ From fddf23f7bad9fcbda5ec36345bdd25256511d272 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 013/244] Revert "[docs] new Ray Cluster documentation (#13839)" This reverts commit 50961189d1bd643e1e9f5641ce4e186cd0ed960c. --- doc/examples/plot_example-lm.rst | 2 +- doc/requirements-doc.txt | 1 - doc/source/cluster/autoscaling.rst | 164 +++ doc/source/cluster/cloud.rst | 162 +-- doc/source/cluster/config.rst | 1138 ++--------------- doc/source/cluster/deploy.rst | 4 - doc/source/cluster/index.rst | 229 +++- doc/source/cluster/kubernetes.rst | 2 +- doc/source/cluster/launcher.rst | 66 + doc/source/cluster/quickstart.rst | 240 ---- doc/source/cluster/reference.rst | 11 - doc/source/cluster/sdk.rst | 13 - doc/source/conf.py | 1 - doc/source/dask-on-ray.rst | 2 +- doc/source/index.rst | 7 +- doc/source/serve/deployment.rst | 2 +- doc/source/starting-ray.rst | 2 +- .../tune/_tutorials/tune-distributed.rst | 6 +- doc/source/tune/user-guide.rst | 2 +- 19 files changed, 551 insertions(+), 1503 deletions(-) create mode 100644 doc/source/cluster/autoscaling.rst create mode 100644 doc/source/cluster/launcher.rst delete mode 100644 doc/source/cluster/quickstart.rst delete mode 100644 doc/source/cluster/reference.rst delete mode 100644 doc/source/cluster/sdk.rst diff --git a/doc/examples/plot_example-lm.rst b/doc/examples/plot_example-lm.rst index 204f470b3f29..843a7e782310 100644 --- a/doc/examples/plot_example-lm.rst +++ b/doc/examples/plot_example-lm.rst @@ -11,7 +11,7 @@ You can view the `code for this example`_. .. 
_`code for this example`: https://github.com/ray-project/ray/tree/master/doc/examples/lm -To use Ray cluster launcher on AWS, install boto (``pip install boto3``) and configure your AWS credentials in ``~/.aws/credentials`` as described on the :ref:`Automatic Cluster Setup page `. +To use Ray cluster launcher on AWS, install boto (``pip install boto3``) and configure your AWS credentials in ``~/.aws/credentials`` as described on the :ref:`Automatic Cluster Setup page `. We provide an `example config file `__ (``lm-cluster.yaml``). In the example config file, we use an ``m5.xlarge`` on-demand instance as the head node, and use ``p3.2xlarge`` GPU spot instances as the worker nodes. We set the minimal number of workers to 1 and maximum workers to 2 in the config, which can be modified according to your own demand. diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index a9a34624a629..cb2c358fa1fa 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -25,7 +25,6 @@ sphinx-jsonschema sphinx-tabs sphinx-version-warning sphinx-book-theme -sphinxcontrib.yt starlette tabulate uvicorn diff --git a/doc/source/cluster/autoscaling.rst b/doc/source/cluster/autoscaling.rst new file mode 100644 index 000000000000..ecb7af15565a --- /dev/null +++ b/doc/source/cluster/autoscaling.rst @@ -0,0 +1,164 @@ +.. _ref-autoscaling: + +Cluster Autoscaling +=================== + +.. tip:: Before you continue, be sure to have read :ref:`cluster-cloud`. + +Basics +------ + +The Ray Cluster Launcher will automatically enable a load-based autoscaler. The scheduler will look at the task, actor, and placement group resource demands from the cluster, and tries to add the minimum set of nodes that can fulfill these demands. When nodes are idle for more than a timeout, they will be removed, down to the ``min_workers`` limit. The head node is never removed. 
+ +To avoid launching too many nodes at once, the number of nodes allowed to be pending is limited by the ``upscaling_speed`` setting. By default it is set to ``1.0``, which means the cluster can be growing in size by at most ``100%`` at any time (e.g., if the cluster currently has 20 nodes, at most 20 pending launches are allowed). This fraction can be set to as high as needed, e.g., ``99999`` to allow the cluster to quickly grow to its max size. + +In more detail, the autoscaler implements the following control loop: + + 1. It calculates the number of nodes required to satisfy all currently pending tasks, actor, and placement group requests. + 2. If the number of nodes required total divided by the number of current nodes exceeds ``1 + upscaling_speed``, then the number of nodes launched will be limited by that threshold. + 3. If a node is idle for a timeout (5 minutes by default), it is removed from the cluster. + +The basic autoscaling config settings are as follows: + +.. code-block:: yaml + + # An unique identifier for the head node and workers of this cluster. + cluster_name: default + + # The minimum number of workers nodes to launch in addition to the head + # node. This number should be >= 0. + min_workers: 0 + + # The autoscaler will scale up the cluster faster with higher upscaling speed. + # E.g., if the task requires adding more nodes then autoscaler will gradually + # scale up the cluster in chunks of upscaling_speed*currently_running_nodes. + # This number should be > 0. + upscaling_speed: 1.0 + + # If a node is idle for this many minutes, it will be removed. A node is + # considered idle if there are no tasks or actors running on it. + idle_timeout_minutes: 5 + +Programmatically Scaling a Cluster +---------------------------------- + +You can from within a Ray program command the autoscaler to scale the cluster up to a desired size with ``request_resources()`` call. 
The cluster will immediately attempt to scale to accomodate the requested resources, bypassing normal upscaling speed constraints. + +.. autofunction:: ray.autoscaler.sdk.request_resources + +Manually Adding Nodes without Resources (Unmanaged Nodes) +--------------------------------------------------------- + +In some cases, adding special nodes without any resources (i.e. `num_cpus=0`) may be desirable. Such nodes can be used as a driver which connects to the cluster to launch jobs. + +In order to manually add a node to an autoscaled cluster, the `ray-cluster-name` tag should be set and `ray-node-type` tag should be set to `unmanaged`. + +Unmanaged nodes **must have 0 resources**. + +If you are using the `available_node_types` field, you should create a custom node type with `resources: {}`, and `max_workers: 0` when configuring the autoscaler. + +The autoscaler will not attempt to start, stop, or update unmanaged nodes. The user is responsible for properly setting up and cleaning up unmanaged nodes. + + +Multiple Node Type Autoscaling +------------------------------ + +Ray supports multiple node types in a single cluster. In this mode of operation, the scheduler will choose the types of nodes to add based on the resource demands, instead of always adding the same kind of node type. + +The concept of a cluster node type encompasses both the physical instance type (e.g., AWS p3.8xl GPU nodes vs m4.16xl CPU nodes), as well as other attributes (e.g., IAM role, the machine image, etc). `Custom resources `__ can be specified for each node type so that Ray is aware of the demand for specific node types at the application level (e.g., a task may request to be placed on a machine with a specific role or machine image via custom resource). + +An example of configuring multiple node types is as follows `(full example) `__: + +.. code-block:: yaml + + # Specify the allowed node types and the resources they provide. 
+ # The key is the name of the node type, which is just for debugging purposes. + # The node config specifies the launch config and physical instance type. + available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m4.xlarge + # For AWS instances, autoscaler will automatically add the available + # CPUs/GPUs/accelerator_type ({"CPU": 4} for m4.xlarge) in "resources". + # resources: {"CPU": 4} + min_workers: 1 + max_workers: 5 + cpu_16_spot: + node_config: + InstanceType: m4.4xlarge + InstanceMarketOptions: + MarketType: spot + # Autoscaler will auto fill the CPU resources below. + resources: {"Custom1": 1, "is_spot": 1} + max_workers: 10 + gpu_1_ondemand: + node_config: + InstanceType: p2.xlarge + # Autoscaler will auto fill the CPU/GPU resources below. + resources: {"Custom2": 2} + max_workers: 4 + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + gpu_8_ondemand: + node_config: + InstanceType: p3.8xlarge + # Autoscaler autofills the "resources" below. + # resources: {"CPU": 32, "GPU": 4, "accelerator_type:V100": 1} + max_workers: 2 + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + + # Specify the node type of the head node (as configured above). + head_node_type: cpu_4_ondemand + + +The above config defines two CPU node types (``cpu_4_ondemand`` and ``cpu_16_spot``), and two GPU types (``gpu_1_ondemand`` and ``gpu_8_ondemand``). Each node type has a name (e.g., ``cpu_4_ondemand``), which has no semantic meaning and is only for debugging. Let's look at the inner fields of the ``gpu_1_ondemand`` node type: + +The node config tells the underlying Cloud provider how to launch a node of this type. This node config is merged with the top level node config of the YAML and can override fields (i.e., to specify the p2.xlarge instance type here): + +.. code-block:: yaml + + node_config: + InstanceType: p2.xlarge + +The resources field tells the autoscaler what kinds of resources this node provides. 
This can include custom resources as well (e.g., "Custom2"). This field enables the autoscaler to automatically select the right kind of nodes to launch given the resource demands of the application. The resources specified here will be automatically passed to the ``ray start`` command for the node via an environment variable. For more information, see also the `resource demand scheduler `__: + +.. code-block:: yaml + + resources: {"CPU": 4, "GPU": 1, "Custom2": 2} + +The ``min_workers`` and ``max_workers`` fields constrain the minimum and maximum number of nodes of this type to launch, respectively: + +.. code-block:: yaml + + min_workers: 1 + max_workers: 4 + +The ``worker_setup_commands`` field (and also the ``initialization_commands`` field, not shown) can be used to override the setup and initialization commands for a node type. Note that you can only override the setup for worker nodes. The head node's setup commands are always configured via the top level field in the cluster YAML: + +.. code-block:: yaml + + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + +Docker Support for Multi-type clusters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For each node type, you can specify ``worker_image`` and ``pull_before_run`` fields. These will override any top level ``docker`` section values (see :ref:`autoscaler-docker`). The ``worker_run_options`` field is combined with top level ``docker: run_options`` field to produce the docker run command for the given node_type. Ray will automatically select the Nvidia docker runtime if it is available. + +The following configuration is for a GPU enabled node type: + +.. code-block:: yaml + + available_node_types: + gpu_1_ondemand: + max_workers: 2 + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + + # Docker specific commands for gpu_1_ondemand + pull_before_run: True + worker_image: + - rayproject/ray-ml:latest-gpu + worker_run_options: # Appended to top-level docker field. 
+ - "-v /home:/home" diff --git a/doc/source/cluster/cloud.rst b/doc/source/cluster/cloud.rst index d2e7b90d55eb..ea59f95eaa79 100644 --- a/doc/source/cluster/cloud.rst +++ b/doc/source/cluster/cloud.rst @@ -272,116 +272,6 @@ There are two ways of running private clusters: $ ray down ray/python/ray/autoscaler/local/example-full.yaml -.. _manual-cluster: - -Manual Ray Cluster Setup ------------------------- - -The most preferable way to run a Ray cluster is via the Ray Cluster Launcher. However, it is also possible to start a Ray cluster by hand. - -This section assumes that you have a list of machines and that the nodes in the cluster can communicate with each other. It also assumes that Ray is installed -on each machine. To install Ray, follow the `installation instructions`_. - -.. _`installation instructions`: http://docs.ray.io/en/master/installation.html - -Starting Ray on each machine -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -On the head node (just choose some node to be the head node), run the following. -If the ``--port`` argument is omitted, Ray will choose port 6379, falling back to a -random port. - -.. code-block:: bash - - $ ray start --head --port=6379 - ... - Next steps - To connect to this Ray runtime from another node, run - ray start --address=':6379' --redis-password='' - - If connection fails, check your firewall settings and network configuration. - -The command will print out the address of the Redis server that was started -(the local node IP address plus the port number you specified). - -**Then on each of the other nodes**, run the following. Make sure to replace -``
`` with the value printed by the command on the head node (it -should look something like ``123.45.67.89:6379``). - -Note that if your compute nodes are on their own subnetwork with Network -Address Translation, to connect from a regular machine outside that subnetwork, -the command printed by the head node will not work. You need to find the -address that will reach the head node from the second machine. If the head node -has a domain address like compute04.berkeley.edu, you can simply use that in -place of an IP address and rely on the DNS. - -.. code-block:: bash - - $ ray start --address=
--redis-password='' - -------------------- - Ray runtime started. - -------------------- - - To terminate the Ray runtime, run - ray stop - -If you wish to specify that a machine has 10 CPUs and 1 GPU, you can do this -with the flags ``--num-cpus=10`` and ``--num-gpus=1``. See the :ref:`Configuration ` page for more information. - -If you see ``Unable to connect to Redis. If the Redis instance is on a -different machine, check that your firewall is configured properly.``, -this means the ``--port`` is inaccessible at the given IP address (because, for -example, the head node is not actually running Ray, or you have the wrong IP -address). - -If you see ``Ray runtime started.``, then the node successfully connected to -the IP address at the ``--port``. You should now be able to connect to the -cluster with ``ray.init(address='auto')``. - -If ``ray.init(address='auto')`` keeps repeating -``redis_context.cc:303: Failed to connect to Redis, retrying.``, then the node -is failing to connect to some other port(s) besides the main port. - -.. code-block:: bash - - If connection fails, check your firewall settings and network configuration. - -If the connection fails, to check whether each port can be reached from a node, -you can use a tool such as ``nmap`` or ``nc``. - -.. code-block:: bash - - $ nmap -sV --reason -p $PORT $HEAD_ADDRESS - Nmap scan report for compute04.berkeley.edu (123.456.78.910) - Host is up, received echo-reply ttl 60 (0.00087s latency). - rDNS record for 123.456.78.910: compute04.berkeley.edu - PORT STATE SERVICE REASON VERSION - 6379/tcp open redis syn-ack ttl 60 Redis key-value store - Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . - $ nc -vv -z $HEAD_ADDRESS $PORT - Connection to compute04.berkeley.edu 6379 port [tcp/*] succeeded! - -If the node cannot access that port at that IP address, you might see - -.. 
code-block:: bash - - $ nmap -sV --reason -p $PORT $HEAD_ADDRESS - Nmap scan report for compute04.berkeley.edu (123.456.78.910) - Host is up (0.0011s latency). - rDNS record for 123.456.78.910: compute04.berkeley.edu - PORT STATE SERVICE REASON VERSION - 6379/tcp closed redis reset ttl 60 - Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . - $ nc -vv -z $HEAD_ADDRESS $PORT - nc: connect to compute04.berkeley.edu port 6379 (tcp) failed: Connection refused - - -Stopping Ray -~~~~~~~~~~~~ - -When you want to stop the Ray processes, run ``ray stop`` on each node. - - Additional Cloud Providers -------------------------- @@ -393,62 +283,16 @@ Security On cloud providers, nodes will be launched into their own security group by default, with traffic allowed only between nodes in the same group. A new SSH key will also be created and saved to your local machine for access to the cluster. -.. _using-ray-on-a-cluster: - -Running a Ray program on the Ray cluster ----------------------------------------- - -To run a distributed Ray program, you'll need to execute your program on the same machine as one of the nodes. - -.. tabs:: - .. group-tab:: Python - - Within your program/script, you must call ``ray.init`` and add the ``address`` parameter to ``ray.init`` (like ``ray.init(address=...)``). This causes Ray to connect to the existing cluster. For example: - - .. code-block:: python - - ray.init(address="auto") - - .. group-tab:: Java - - You need to add the ``ray.address`` parameter to your command line (like ``-Dray.address=...``). - - To connect your program to the Ray cluster, run it like this: - - .. code-block:: bash - - java -classpath \ - -Dray.address=
\ - - - .. note:: Specifying ``auto`` as the address hasn't been implemented in Java yet. You need to provide the actual address. You can find the address of the server from the output of the ``ray up`` command. - - -.. note:: A common mistake is setting the address to be a cluster node while running the script on your laptop. This will not work because the script needs to be started/executed on one of the Ray nodes. - -To verify that the correct number of nodes have joined the cluster, you can run the following. - -.. code-block:: python - - import time - - @ray.remote - def f(): - time.sleep(0.01) - return ray.services.get_node_ip_address() - - # Get a list of the IP addresses of the nodes that have joined the cluster. - set(ray.get([f.remote() for _ in range(1000)])) - What's Next? ------------- Now that you have a working understanding of the cluster launcher, check out: -* :ref:`ref-cluster-quick-start`: A end-to-end demo to run an application that autoscales. -* :ref:`cluster-config`: A complete reference of how to configure your Ray cluster. +* :ref:`cluster-config`: A guide to configuring your Ray cluster. * :ref:`cluster-commands`: A short user guide to the various cluster launcher commands. +* A `step by step guide`_ to using the cluster launcher +* :ref:`ref-autoscaling`: An overview of how Ray autoscaling works. diff --git a/doc/source/cluster/config.rst b/doc/source/cluster/config.rst index 430d5473de0c..8260e8f6b7e6 100644 --- a/doc/source/cluster/config.rst +++ b/doc/source/cluster/config.rst @@ -1,286 +1,82 @@ .. _cluster-config: -Cluster YAML Configuration Options -================================== +Configuring your Cluster +======================== -The cluster configuration is defined within a YAML file that will be used by the Cluster Launcher to launch the head node, and by the Autoscaler to launch worker nodes. 
Once the cluster configuration is defined, you will need to use the :ref:`Ray CLI ` to perform any operations such as starting and stopping the cluster. +.. tip:: Before you continue, be sure to have read :ref:`cluster-cloud`. -Syntax ------- +To launch a cluster, you must first create a *cluster configuration file*, which specifies some important details about the cluster. -.. parsed-literal:: +Quickstart +---------- - :ref:`cluster_name `: str - :ref:`max_workers `: int - :ref:`upscaling_speed `: float - :ref:`idle_timeout_minutes `: int - :ref:`docker `: - :ref:`docker ` - :ref:`provider `: - :ref:`provider ` - :ref:`auth `: - :ref:`auth ` - :ref:`available_node_types `: - :ref:`node_types ` - :ref:`worker_nodes `: - :ref:`node_config ` - :ref:`head_node_type `: str - :ref:`file_mounts `: - :ref:`file_mounts ` - :ref:`cluster_synced_files `: - - str - :ref:`rsync_exclude `: - - str - :ref:`rsync_filter `: - - str - :ref:`initialization_commands `: - - str - :ref:`setup_commands `: - - str - :ref:`head_setup_commands `: - - str - :ref:`worker_setup_commands `: - - str - :ref:`head_start_ray_commands `: - - str - :ref:`worker_start_ray_commands `: - - str +At a minimum, we need to specify: -Custom types ------------- +* the name of your cluster, +* the number of workers in the cluster +* the cloud provider +* any setup commands that should run on the node upon launch. -.. _cluster-configuration-docker-type: +Here is an example cluster configuration file: -Docker -~~~~~~ - -.. parsed-literal:: - :ref:`image `: str - :ref:`head_image `: str - :ref:`worker_image `: str - :ref:`container_name `: str - :ref:`pull_before_run `: bool - :ref:`run_options `: - - str - :ref:`head_run_options `: - - str - :ref:`worker_run_options `: - - str - :ref:`disable_automatic_runtime_detection `: bool - :ref:`disable_shm_size_detection `: bool - -.. _cluster-configuration-auth-type: - -Auth -~~~~ - -.. tabs:: - .. group-tab:: AWS - - .. 
parsed-literal:: - - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str - - .. group-tab:: Azure - - .. parsed-literal:: - - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str - :ref:`ssh_public_key `: str - - .. group-tab:: GCP - - .. parsed-literal:: - - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str - -.. _cluster-configuration-provider-type: - -Provider -~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - .. parsed-literal:: - - :ref:`type `: str - :ref:`region `: str - :ref:`availability_zone `: str - :ref:`cache_stopped_nodes `: bool - - .. group-tab:: Azure - - .. parsed-literal:: - - :ref:`type `: str - :ref:`location `: str - :ref:`resource_group `: str - :ref:`subscription_id `: str - :ref:`cache_stopped_nodes `: bool - - .. group-tab:: GCP - - .. parsed-literal:: - - :ref:`type `: str - :ref:`region `: str - :ref:`availability_zone `: str - :ref:`project_id `: str - :ref:`cache_stopped_nodes `: bool - -.. _cluster-configuration-node-types-type: - -Node types -~~~~~~~~~~ - -The nodes types object's keys represent the names of the different node types. - -.. parsed-literal:: - : - :ref:`node_config `: - :ref:`Node config ` - :ref:`resources `: - :ref:`Resources ` - :ref:`min_workers `: int - :ref:`max_workers `: int - :ref:`worker_setup_commands `: - - str - :ref:`docker `: - :ref:`Node Docker ` - : - ... - ... - -.. _cluster-configuration-node-config-type: - -Node config -~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - A YAML object as defined in `the AWS docs `_. - - .. group-tab:: Azure - - A YAML object as defined in `the deployment template `_ whose resources are defined in `the Azure docs `_. - - .. group-tab:: GCP - - A YAML object as defined in `the GCP docs `_. - -.. _cluster-configuration-node-docker-type: - -Node Docker -~~~~~~~~~~~ - -.. parsed-literal:: - - :ref:`image `: str - :ref:`pull_before_run `: bool - :ref:`run_options `: - - str - :ref:`disable_automatic_runtime_detection `: bool - :ref:`disable_shm_size_detection `: bool - -.. 
_cluster-configuration-resources-type: - -Resources -~~~~~~~~~ - -.. parsed-literal:: - - :ref:`CPU `: int - :ref:`GPU `: int - : int - : int - ... - -.. _cluster-configuration-file-mounts-type: - -File mounts -~~~~~~~~~~~ - -.. parsed-literal:: - : str # Path 1 on local machine - : str # Path 2 on local machine - ... - -Properties and Definitions --------------------------- - -.. _cluster-configuration-cluster-name: +.. code-block:: yaml -``cluster_name`` -~~~~~~~~~~~~~~~~ + # A unique identifier for this cluster. + cluster_name: basic-ray -The name of the cluster. This is the namespace of the cluster. + # The maximum number of workers nodes to launch in addition to the head + # node. + max_workers: 0 # this means zero workers -* **Required:** Yes -* **Importance:** High -* **Type:** String -* **Default:** "default" -* **Pattern:** ``[a-zA-Z0-9_]+`` + # Cloud-provider specific configuration. + provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a -.. _cluster-configuration-max-workers: + # How Ray will authenticate with newly launched nodes. + auth: + ssh_user: ubuntu -``max_workers`` -~~~~~~~~~~~~~~~ + setup_commands: + - pip install ray[all] + # The following line demonstrate that you can specify arbitrary + # startup scripts on the cluster. + - touch /tmp/some_file.txt -The maximum number of workers the cluster will have at any given time. +Most of the example YAML file is optional. Here is a `reference minimal YAML file `__, and you can find the defaults for `optional fields in this YAML file `__. -* **Required:** No -* **Importance:** High -* **Type:** Integer -* **Default:** ``2`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded +In another example, the `AWS example configuration file `__ cluster config file will create a small cluster with an m5.large head node (on-demand) configured to autoscale up to two m5.large `spot workers `__. -.. 
_cluster-configuration-upscaling-speed: +**You are encouraged to copy the example YAML file and modify it to your needs. This may include adding additional setup commands to install libraries or sync local data files.** -``upscaling_speed`` -~~~~~~~~~~~~~~~~~~~ +Setup Commands +-------------- -The number of nodes allowed to be pending as a multiple of the current number of nodes. For example, if set to 1.0, the cluster can grow in size by at most 100% at any time, so if the cluster currently has 20 nodes, at most 20 pending launches are allowed. +.. tip:: After you have customized the nodes, create a new machine image (or docker container) and use that in the config file to reduce setup times. -* **Required:** No -* **Importance:** Medium -* **Type:** Float -* **Default:** ``1.0`` -* **Minimum:** ``0.0`` -* **Maximum:** Unbounded +The setup commands you use should ideally be *idempotent* (i.e., can be run multiple times without changing the result). This allows Ray to safely update nodes after they have been created. -.. _cluster-configuration-idle-timeout-minutes: +You can usually make commands idempotent with small modifications, e.g. ``git clone foo`` can be rewritten as ``test -e foo || git clone foo`` which checks if the repo is already cloned first. -``idle_timeout_minutes`` -~~~~~~~~~~~~~~~~~~~~~~~~ +.. _autoscaler-docker: -The number of minutes that need to pass before an idle worker node is removed by the Autoscaler. +Docker Support +-------------- -* **Required:** No -* **Importance:** Medium -* **Type:** Integer -* **Default:** ``5`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded +The cluster launcher is fully compatible with Docker images. To use Docker, provide an ``image`` and ``container_name`` in the ``docker`` field of the YAML. -.. _cluster-configuration-docker: +.. code-block:: yaml -``docker`` -~~~~~~~~~~ + docker: + container_name: "ray_container" + image: "rayproject/ray-ml:latest-gpu" -Configure Ray to run in Docker containers. 
+We provide docker images on `DockerHub `__. The ``rayproject/ray-ml:latest`` image is a quick way to get up and running. -* **Required:** No -* **Importance:** High -* **Type:** :ref:`Docker ` -* **Default:** ``{}`` +When the cluster is launched, all of the Ray tasks will be executed completely inside of the container. For GPU support, Ray will automatically select the Nvidia docker runtime if available, and you just need to specify a docker image with the CUDA support (``rayproject/ray-ml:latest-gpu`` and all of our ``-gpu`` images have this). -In rare cases when Docker is not available on the system by default (e.g., bad AMI), add the following commands to :ref:`initialization_commands ` to install it. +If Docker is not installed, add the following commands to ``initialization_commands`` to install it. .. code-block:: yaml @@ -290,813 +86,59 @@ In rare cases when Docker is not available on the system by default (e.g., bad A - sudo usermod -aG docker $USER - sudo systemctl restart docker -f -.. _cluster-configuration-provider: - -``provider`` -~~~~~~~~~~~~ - -The cloud provider-specific configuration properties. - -* **Required:** Yes -* **Importance:** High -* **Type:** :ref:`Provider ` - -.. _cluster-configuration-auth: - -``auth`` -~~~~~~~~ - -Authentication credentials that Ray will use to launch nodes. - -* **Required:** Yes -* **Importance:** High -* **Type:** :ref:`Auth ` - -.. _cluster-configuration-available-node-types: - -``available_node_types`` -~~~~~~~~~~~~~~~~~~~~~~~~ - -Tells the autoscaler the allowed node types and the resources they provide. -The key is the name of the node type, which is just for debugging purposes. - -* **Required:** No -* **Importance:** High -* **Type:** :ref:`Node types ` -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. 
code-block:: yaml - - available_node_types: - ray.head.default: - node_config: - InstanceType: m5.large - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 100 - resources: {"CPU": 2} - min_workers: 0 - max_workers: 0 - ray.worker.small: - node_config: - InstanceType: m5.large - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 2} - min_workers: 0 - max_workers: 1 - -.. _cluster-configuration-head-node-type: - -``head_node_type`` -~~~~~~~~~~~~~~~~~~ - -The key for one of the node types in :ref:`available_node_types `. This node type will be used to launch the head node. - - -* **Required:** Yes -* **Importance:** High -* **Type:** String -* **Pattern:** ``[a-zA-Z0-9_]+`` - -.. _cluster-configuration-worker-nodes: - -``worker_nodes`` -~~~~~~~~~~~~~~~~ - -The configuration to be used to launch worker nodes on the cloud service provider. Generally, node configs are set in the :ref:`node config of each node type `. Setting this property allows propagation of a default value to all the node types when they launch as workers (e.g., using spot instances across all workers can be configured here so that it doesn't have to be set across all instance types). - -* **Required:** No -* **Importance:** Low -* **Type:** :ref:`Node config ` -* **Default:** ``{}`` - -.. _cluster-configuration-file-mounts: - -``file_mounts`` -~~~~~~~~~~~~~~~ - -The files or directories to copy to the head and worker nodes. - -* **Required:** No -* **Importance:** High -* **Type:** :ref:`File mounts ` -* **Default:** ``[]`` - -.. _cluster-configuration-cluster-synced-files: - -``cluster_synced_files`` -~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of paths to the files or directories to copy from the head node to the worker nodes. The same path on the head node will be copied to the worker node. This behavior is a subset of the file_mounts behavior, so in the vast majority of cases one should just use :ref:`file_mounts `. 
- -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-rsync-exclude: - -``rsync_exclude`` -~~~~~~~~~~~~~~~~~ - -A list of patterns for files to exclude when running ``rsync up`` or ``rsync down``. The filter is applied on the source directory only. - -Example for a pattern in the list: ``**/.git/**``. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-rsync-filter: - -``rsync_filter`` -~~~~~~~~~~~~~~~~ - -A list of patterns for files to exclude when running ``rsync up`` or ``rsync down``. The filter is applied on the source directory and recursively through all subdirectories. - -Example for a pattern in the list: ``.gitignore``. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-initialization-commands: - -``initialization_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands that will be run before the :ref:`setup commands `. If Docker is enabled, these commands will run outside the container and before Docker is setup. - -* **Required:** No -* **Importance:** Medium -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-setup-commands: - -``setup_commands`` -~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up nodes. These commands will always run on the head and worker nodes and will be merged with :ref:`head setup commands ` for head and with :ref:`worker setup commands ` for workers. - -* **Required:** No -* **Importance:** Medium -* **Type:** List of String -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. 
code-block:: yaml - - # Default setup_commands: - setup_commands: - - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl - -- Setup commands should ideally be *idempotent* (i.e., can be run multiple times without changing the result); this allows Ray to safely update nodes after they have been created. You can usually make commands idempotent with small modifications, e.g. ``git clone foo`` can be rewritten as ``test -e foo || git clone foo`` which checks if the repo is already cloned first. - -- Setup commands are run sequentially but separately. For example, if you are using anaconda, you need to run ``conda activate env && pip install -U ray`` because splitting the command into two setup commands will not work. - -- Ideally, you should avoid using setup_commands by creating a docker image with all the dependencies preinstalled to minimize startup time. - -- **Tip**: if you also want to run apt-get commands during setup add the following list of commands: - - .. code-block:: yaml - - setup_commands: - - sudo pkill -9 apt-get || true - - sudo pkill -9 dpkg || true - - sudo dpkg --configure -a - -.. _cluster-configuration-head-setup-commands: - -``head_setup_commands`` -~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up the head node. These commands will be merged with the general :ref:`setup commands `. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-worker-setup-commands: - -``worker_setup_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up the worker nodes. These commands will be merged with the general :ref:`setup commands `. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. 
_cluster-configuration-head-start-ray-commands: - -``head_start_ray_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Commands to start ray on the head node. You don't need to change this. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: yaml - - head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml - -.. _cluster-configuration-worker-start-ray-commands: - -``worker_start_ray_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Command to start ray on worker nodes. You don't need to change this. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: yaml - - worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - -.. _cluster-configuration-image: - -``docker.image`` -~~~~~~~~~~~~~~~~ - -The default Docker image to pull in the head and worker nodes. This can be overridden by the :ref:`head_image ` and :ref:`worker_image ` fields. If neither `image` nor (:ref:`head_image ` and :ref:`worker_image `) are specified, Ray will not use Docker. - -* **Required:** Yes (If Docker is in use.) -* **Importance:** High -* **Type:** String - -The Ray project provides Docker images on `DockerHub `_. The repository includes following images: - -* ``rayproject/ray-ml:latest-gpu``: CUDA support, includes ML dependencies. -* ``rayproject/ray:latest-gpu``: CUDA support, no ML dependencies. -* ``rayproject/ray-ml:latest``: No CUDA support, includes ML dependencies. -* ``rayproject/ray:latest``: No CUDA support, no ML dependencies. - -.. _cluster-configuration-head-image: - -``docker.head_image`` -~~~~~~~~~~~~~~~~~~~~~ -Docker image for the head node to override the default :ref:`docker image `. 
- -* **Required:** No -* **Importance:** Low -* **Type:** String - -.. _cluster-configuration-worker-image: - -``docker.worker_image`` -~~~~~~~~~~~~~~~~~~~~~~~ -Docker image for the worker nodes to override the default :ref:`docker image `. - -* **Required:** No -* **Importance:** Low -* **Type:** String - -.. _cluster-configuration-container-name: - -``docker.container_name`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The name to use when starting the Docker container. - -* **Required:** Yes (If Docker is in use.) -* **Importance:** Low -* **Type:** String -* **Default:** ray_container - -.. _cluster-configuration-pull-before-run: - -``docker.pull_before_run`` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, the latest version of image will be pulled when starting Docker. If disabled, ``docker run`` will only pull the image if no cached version is present. - -* **Required:** No -* **Importance:** Medium -* **Type:** Boolean -* **Default:** ``True`` - -.. _cluster-configuration-run-options: - -``docker.run_options`` -~~~~~~~~~~~~~~~~~~~~~~ - -The extra options to pass to ``docker run``. - -* **Required:** No -* **Importance:** Medium -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-head-run-options: - -``docker.head_run_options`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The extra options to pass to ``docker run`` for head node only. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` +Common cluster configurations +----------------------------- -.. _cluster-configuration-worker-run-options: +The `example-full.yaml `__ configuration is enough to get started with Ray, but for more compute intensive workloads you will want to change the instance types to e.g. use GPU or larger compute instance by editing the yaml file. 
-``docker.worker_run_options`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Here are a few common configurations (note that we use AWS in the examples, but these examples are generic): -The extra options to pass to ``docker run`` for worker nodes only. +**GPU single node**: use Ray on a single large GPU instance. -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-disable-automatic-runtime-detection: - -``docker.disable_automatic_runtime_detection`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, Ray will not try to use the NVIDIA Container Runtime if GPUs are present. - -* **Required:** No -* **Importance:** Low -* **Type:** Boolean -* **Default:** ``False`` - - -.. _cluster-configuration-disable-shm-size-detection: - -``docker.disable_shm_size_detection`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, Ray will not automatically specify the size ``/dev/shm`` for the started container and the runtime's default value (64MiB for Docker) will be used. - -* **Required:** No -* **Importance:** Low -* **Type:** Boolean -* **Default:** ``False`` - - -.. _cluster-configuration-ssh-user: - -``auth.ssh_user`` -~~~~~~~~~~~~~~~~~ - -The user that Ray will authenticate with when launching new nodes. - -* **Required:** Yes -* **Importance:** High -* **Type:** String - -.. _cluster-configuration-ssh-private-key: - -``auth.ssh_private_key`` -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. - - * **Required:** No - * **Importance:** Low - * **Type:** String - - .. group-tab:: Azure - - The path to an existing private key for Ray to use. 
- - * **Required:** Yes - * **Importance:** High - * **Type:** String - - You may use ``ssh-keygen -t rsa -b 4096`` to generate a new ssh keypair. - - .. group-tab:: GCP - - The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. - - * **Required:** No - * **Importance:** Low - * **Type:** String - -.. _cluster-configuration-ssh-public-key: - -``auth.ssh_public_key`` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The path to an existing public key for Ray to use. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-type: - -``provider.type`` -~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The cloud service provider. For AWS, this must be set to ``aws``. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - - .. group-tab:: Azure - - The cloud service provider. For Azure, this must be set to ``azure``. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - - .. group-tab:: GCP - - The cloud service provider. For GCP, this must be set to ``gcp``. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - -.. _cluster-configuration-region: - -``provider.region`` -~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The region to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** us-west-2 - - .. group-tab:: Azure - - Not available. - - .. group-tab:: GCP - - The region to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** us-west1 - -.. 
_cluster-configuration-availability-zone: - -``provider.availability_zone`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. - - * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** us-west-2a,us-west-2b - - .. group-tab:: Azure - - Not available. - - .. group-tab:: GCP - - A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. - - * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** us-west1-a - -.. _cluster-configuration-location: - -``provider.location`` -~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The location to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** westus2 - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-resource-group: - -``provider.resource_group`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The resource group to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** ray-cluster - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-subscription-id: - -``provider.subscription_id`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The subscription ID to use for deployment of the Ray cluster. If not specified, Ray will use the default from the Azure CLI. - - * **Required:** No - * **Importance:** High - * **Type:** String - * **Default:** ``""`` - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-project-id: - -``provider.project_id`` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. 
group-tab:: Azure - - Not available. - - .. group-tab:: GCP - - The globally unique project ID to use for deployment of the Ray cluster. - - * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** ``null`` - -.. _cluster-configuration-cache-stopped-nodes: - -``provider.cache_stopped_nodes`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, nodes will be *stopped* when the cluster scales down. If disabled, nodes will be *terminated* instead. Stopped nodes launch faster than terminated nodes. - - -* **Required:** No -* **Importance:** Low -* **Type:** Boolean -* **Default:** ``True`` - -.. _cluster-configuration-node-config: - -``available_node_types..node_type.node_config`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The configuration to be used to launch the nodes on the cloud service provider. Among other things, this will specify the instance type to be launched. - -* **Required:** Yes -* **Importance:** High -* **Type:** :ref:`Node config ` - -.. _cluster-configuration-resources: - -``available_node_types..node_type.resources`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The resources that a node type provides, which enables the autoscaler to automatically select the right type of nodes to launch given the resource demands of the application. The resources specified will be automatically passed to the ``ray start`` command for the node via an environment variable. If not provided, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. For more information, see also the `resource demand scheduler `_ - -* **Required:** Yes (except for AWS/K8s) -* **Importance:** High -* **Type:** :ref:`Resources ` -* **Default:** ``{}`` - -In some cases, adding special nodes without any resources may be desirable. Such nodes can be used as a driver which connects to the cluster to launch jobs. 
In order to manually add a node to an autoscaled cluster, the *ray-cluster-name* tag should be set and *ray-node-type* tag should be set to unmanaged. Unmanaged nodes can be created by setting the resources to ``{}`` and the :ref:`maximum workers ` to 0. The Autoscaler will not attempt to start, stop, or update unmanaged nodes. The user is responsible for properly setting up and cleaning up unmanaged nodes. - -.. _cluster-configuration-node-min-workers: - -``available_node_types..node_type.min_workers`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The minimum number of workers to maintain for this node type regardless of utilization. - -* **Required:** No -* **Importance:** High -* **Type:** Integer -* **Default:** ``0`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded - -.. _cluster-configuration-node-max-workers: - -``available_node_types..node_type.max_workers`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The maximum number of workers to have in the cluster for this node type regardless of utilization. This takes precedence over :ref:`minimum workers `. - -* **Required:** No -* **Importance:** High -* **Type:** Integer -* **Default:** ``0`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded - -.. _cluster-configuration-node-type-worker-setup-commands: - -``available_node_types..node_type.worker_setup_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up worker nodes of this type. These commands will replace the general :ref:`worker setup commands ` for the node. - -* **Required:** No -* **Importance:** low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-cpu: - -``available_node_types..node_type.resources.CPU`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The number of CPUs made available by this node. 
If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. - - * **Required:** Yes (except for AWS/K8s) - * **Importance:** High - * **Type:** Integer - - .. group-tab:: Azure - - The number of CPUs made available by this node. - - * **Required:** Yes - * **Importance:** High - * **Type:** Integer - - .. group-tab:: GCP - - The number of CPUs made available by this node. - - * **Required:** No - * **Importance:** High - * **Type:** Integer - - -.. _cluster-configuration-gpu: - -``available_node_types..node_type.resources.GPU`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The number of GPUs made available by this node. If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. - - * **Required:** No - * **Importance:** Low - * **Type:** Integer - - .. group-tab:: Azure - - The number of GPUs made available by this node. - - * **Required:** No - * **Importance:** High - * **Type:** Integer - - .. group-tab:: GCP - - The number of GPUs made available by this node. - - * **Required:** No - * **Importance:** High - * **Type:** Integer - -.. _cluster-configuration-node-docker: - -``available_node_types..docker`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A set of overrides to the top-level :ref:`Docker ` configuration. +.. code-block:: yaml -* **Required:** No -* **Importance:** Low -* **Type:** :ref:`docker ` -* **Default:** ``{}`` + max_workers: 0 + head_node: + InstanceType: p2.8xlarge -Examples --------- -Minimal configuration -~~~~~~~~~~~~~~~~~~~~~ +**Mixed GPU and CPU nodes**: for RL applications that require proportionally more +CPU than GPU resources, you can use additional CPU workers with a GPU head node. -.. tabs:: - .. group-tab:: AWS +.. code-block:: yaml - .. 
literalinclude:: ../../../python/ray/autoscaler/aws/example-minimal.yaml - :language: yaml + max_workers: 10 + head_node: + InstanceType: p2.8xlarge + worker_nodes: + InstanceType: m4.16xlarge - .. group-tab:: Azure - - .. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml - :language: yaml +**Autoscaling CPU cluster**: use a small head node and have Ray auto-scale +workers as needed. This can be a cost-efficient configuration for clusters with +bursty workloads. You can also request spot workers for additional cost savings. - .. group-tab:: GCP - - .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml - :language: yaml +.. code-block:: yaml -Full configuration -~~~~~~~~~~~~~~~~~~ + min_workers: 0 + max_workers: 10 + head_node: + InstanceType: m4.large + worker_nodes: + InstanceMarketOptions: + MarketType: spot + InstanceType: m4.16xlarge -.. tabs:: - .. group-tab:: AWS +**Autoscaling GPU cluster**: similar to the autoscaling CPU cluster, but +with GPU worker nodes instead. - .. literalinclude:: ../../../python/ray/autoscaler/aws/example-full.yaml - :language: yaml +.. code-block:: yaml - .. group-tab:: Azure - - .. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml - :language: yaml + min_workers: 0 # NOTE: older Ray versions may need 1+ GPU workers (#2106) + max_workers: 10 + head_node: + InstanceType: m4.large + worker_nodes: + InstanceMarketOptions: + MarketType: spot + InstanceType: p2.xlarge - .. group-tab:: GCP - - .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml - :language: yaml diff --git a/doc/source/cluster/deploy.rst b/doc/source/cluster/deploy.rst index 24bcfe456e0d..e9253614f496 100644 --- a/doc/source/cluster/deploy.rst +++ b/doc/source/cluster/deploy.rst @@ -3,10 +3,6 @@ Ray with Cluster Managers ========================= -.. note:: - - If you're using AWS, Azure or GCP you can use the :ref:`Ray Cluster Launcher ` to simplify the cluster setup process. - .. 
toctree:: :maxdepth: 2 diff --git a/doc/source/cluster/index.rst b/doc/source/cluster/index.rst index f32fab54874a..c95eca1cb2b7 100644 --- a/doc/source/cluster/index.rst +++ b/doc/source/cluster/index.rst @@ -1,26 +1,229 @@ .. _cluster-index: -Ray Cluster Overview -==================== +Distributed Ray Overview +======================== -What is a Ray cluster? +One of Ray's strengths is the ability to leverage multiple machines in the same program. Ray can, of course, be run on a single machine (and is done so often), but the real power is using Ray on a cluster of machines. + +Key Concepts +------------ + +* **Ray Nodes**: A Ray cluster consists of a **head node** and a set of **worker nodes**. The head node needs to be started first, and the worker nodes are given the address of the head node to form the cluster. The Ray cluster itself can also "auto-scale," meaning that it can interact with a Cloud Provider to request or release instances according to application workload. + +* **Ports**: Ray processes communicate via TCP ports. When starting a Ray cluster, either on-premises or on the cloud, it is important to open the right ports so that Ray functions correctly. See :ref:`the Ray Ports documentation ` for more details. + +* **Ray Cluster Launcher**: The :ref:`Ray Cluster Launcher ` is a simple tool that automatically provisions machines and launches a multi-node Ray cluster. You can use the cluster launcher on GCP, Amazon EC2, Azure, or even Kubernetes. + +Summary +------- + +Clusters are started with the :ref:`Ray Cluster Launcher ` or :ref:`manually `. + +You can also create a Ray cluster using a standard cluster manager such as :ref:`Kubernetes `, :ref:`YARN `, or :ref:`SLURM `. + +After a cluster is started, you need to connect your program to the Ray cluster by starting a driver process on the same node where you ran ``ray start``: + +.. tabs:: + .. code-tab:: python + + # This must be run on a node where `ray start` was executed. + import ray + ray.init(address='auto') + + .. group-tab:: java + + ..
code-block:: java + + import io.ray.api.Ray; + + public class MyRayApp { + + public static void main(String[] args) { + Ray.init(); + ... + } + } + + .. code-block:: bash + + java -classpath <classpath> \ + -Dray.address=<address> \ + <classname> <args> + +and then the rest of your script should be able to leverage Ray as a distributed framework! + + +Using the cluster launcher +-------------------------- + +The ``ray up`` command uses the :ref:`Ray Cluster Launcher ` to start a cluster on the cloud, creating a designated "head node" and worker nodes. Any Python process that runs ``ray.init(address=...)`` on any of the cluster nodes will connect to the Ray cluster. + +.. important:: Calling ``ray.init`` on your laptop will not work if using ``ray up``, since your laptop will not be the head node. + +Here is an example of using the Cluster Launcher on AWS: + +.. code-block:: shell + + # First, run `pip install boto3` and `aws configure` + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/aws/example-full.yaml + +You can monitor the Ray cluster status with ``ray monitor cluster.yaml`` and ssh into the head node with ``ray attach cluster.yaml``. + +.. _manual-cluster: + +Manual Ray Cluster Setup ------------------------ -One of Ray's strengths is the ability to leverage multiple machines in the same program. Ray can, of course, be run on a single machine (and is done so often), but the real power is using Ray on a cluster of machines. +The preferred way to run a Ray cluster is via the :ref:`Ray Cluster Launcher `. However, it is also possible to start a Ray cluster by hand. + +This section assumes that you have a list of machines and that the nodes in the cluster can communicate with each other. It also assumes that Ray is installed +on each machine. To install Ray, follow the `installation instructions`_. + +.. _`installation instructions`: http://docs.ray.io/en/master/installation.html + +Starting Ray on each machine +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On the head node (just choose some node to be the head node), run the following.
+If the ``--port`` argument is omitted, Ray will choose port 6379, falling back to a +random port. + +.. code-block:: bash + + $ ray start --head --port=6379 + ... + Next steps + To connect to this Ray runtime from another node, run + ray start --address='<ip address>:6379' --redis-password='<password>' + + If connection fails, check your firewall settings and network configuration. + +The command will print out the address of the Redis server that was started +(the local node IP address plus the port number you specified). + +**Then on each of the other nodes**, run the following. Make sure to replace +``<address>`` with the value printed by the command on the head node (it +should look something like ``123.45.67.89:6379``). + +Note that if your compute nodes are on their own subnetwork with Network +Address Translation, to connect from a regular machine outside that subnetwork, +the command printed by the head node will not work. You need to find the +address that will reach the head node from the second machine. If the head node +has a domain address like compute04.berkeley.edu, you can simply use that in +place of an IP address and rely on the DNS. + +.. code-block:: bash + + $ ray start --address=<address>
--redis-password='' + -------------------- + Ray runtime started. + -------------------- + + To terminate the Ray runtime, run + ray stop + +If you wish to specify that a machine has 10 CPUs and 1 GPU, you can do this +with the flags ``--num-cpus=10`` and ``--num-gpus=1``. See the :ref:`Configuration ` page for more information. + +If you see ``Unable to connect to Redis. If the Redis instance is on a +different machine, check that your firewall is configured properly.``, +this means the ``--port`` is inaccessible at the given IP address (because, for +example, the head node is not actually running Ray, or you have the wrong IP +address). + +If you see ``Ray runtime started.``, then the node successfully connected to +the IP address at the ``--port``. You should now be able to connect to the +cluster with ``ray.init(address='auto')``. + +If ``ray.init(address='auto')`` keeps repeating +``redis_context.cc:303: Failed to connect to Redis, retrying.``, then the node +is failing to connect to some other port(s) besides the main port. + +.. code-block:: bash + + If connection fails, check your firewall settings and network configuration. + +If the connection fails, to check whether each port can be reached from a node, +you can use a tool such as ``nmap`` or ``nc``. + +.. code-block:: bash + + $ nmap -sV --reason -p $PORT $HEAD_ADDRESS + Nmap scan report for compute04.berkeley.edu (123.456.78.910) + Host is up, received echo-reply ttl 60 (0.00087s latency). + rDNS record for 123.456.78.910: compute04.berkeley.edu + PORT STATE SERVICE REASON VERSION + 6379/tcp open redis syn-ack ttl 60 Redis key-value store + Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . + $ nc -vv -z $HEAD_ADDRESS $PORT + Connection to compute04.berkeley.edu 6379 port [tcp/*] succeeded! + +If the node cannot access that port at that IP address, you might see + +.. 
code-block:: bash + + $ nmap -sV --reason -p $PORT $HEAD_ADDRESS + Nmap scan report for compute04.berkeley.edu (123.456.78.910) + Host is up (0.0011s latency). + rDNS record for 123.456.78.910: compute04.berkeley.edu + PORT STATE SERVICE REASON VERSION + 6379/tcp closed redis reset ttl 60 + Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . + $ nc -vv -z $HEAD_ADDRESS $PORT + nc: connect to compute04.berkeley.edu port 6379 (tcp) failed: Connection refused + + +Stopping Ray +~~~~~~~~~~~~ + +When you want to stop the Ray processes, run ``ray stop`` on each node. + +.. _using-ray-on-a-cluster: + +Running a Ray program on the Ray cluster +---------------------------------------- + +To run a distributed Ray program, you'll need to execute your program on the same machine as one of the nodes. + +.. tabs:: + .. group-tab:: Python + + Within your program/script, you must call ``ray.init`` and add the ``address`` parameter to ``ray.init`` (like ``ray.init(address=...)``). This causes Ray to connect to the existing cluster. For example: + + .. code-block:: python + + ray.init(address="auto") + + .. group-tab:: Java + + You need to add the ``ray.address`` parameter to your command line (like ``-Dray.address=...``). + + To connect your program to the Ray cluster, run it like this: + + .. code-block:: bash + + java -classpath \ + -Dray.address=
\ + -A Ray cluster consists of a **head node** and a set of **worker nodes**. The head node needs to be started first, and the worker nodes are given the address of the head node to form the cluster. + .. note:: Specifying ``auto`` as the address hasn't been implemented in Java yet. You need to provide the actual address. You can find the address of the server from the output of the ``ray up`` command. -You can use the Ray Cluster Launcher to provision machines and launch a multi-node Ray cluster. You can use the cluster launcher on AWS, GCP, Azure, Kubernetes, on-premise, and Staroid or even on your custom node provider. Ray clusters can also make use of the Ray Autoscaler, which allows Ray to interact with a cloud provider to request or release instances according to application workload. -How does it work? ------------------ +.. note:: A common mistake is setting the address to be a cluster node while running the script on your laptop. This will not work because the script needs to be started/executed on one of the Ray nodes. -The Ray Cluster Launcher will automatically enable a load-based autoscaler. The autoscaler resource demand scheduler will look at the pending tasks, actors, and placement groups resource demands from the cluster, and try to add the minimum list of nodes that can fulfill these demands. When worker nodes are idle for more than :ref:`idle_timeout_minutes `, they will be removed (the head node is never removed unless the cluster is teared down). +To verify that the correct number of nodes have joined the cluster, you can run the following. -Autoscaler uses a simple binpacking algorithm to binpack the user demands into the available cluster resources. The remaining unfulfilled demands are placed on the smallest list of nodes that satisfies the demand while maximizing utilization (starting from the smallest node). +.. 
code-block:: python -**Here is "A Glimpse into the Ray Autoscaler" and how to debug/monitor your cluster:** + import time -2021-19-01 by Ameer Haj-Ali, Anyscale, Inc. + @ray.remote + def f(): + time.sleep(0.01) + return ray.services.get_node_ip_address() -.. youtube:: BJ06eJasdu4 + # Get a list of the IP addresses of the nodes that have joined the cluster. + set(ray.get([f.remote() for _ in range(1000)])) diff --git a/doc/source/cluster/kubernetes.rst b/doc/source/cluster/kubernetes.rst index 1234ece998c0..94711b59507e 100644 --- a/doc/source/cluster/kubernetes.rst +++ b/doc/source/cluster/kubernetes.rst @@ -41,7 +41,7 @@ Below is a brief overview of the two tools. The Ray Cluster Launcher ------------------------ -The :ref:`Ray Cluster Launcher ` is geared towards experimentation and development and can be used to launch Ray clusters on Kubernetes (among other backends). +The :ref:`Ray Cluster Launcher ` is geared towards experimentation and development and can be used to launch Ray clusters on Kubernetes (among other backends). It allows you to manage an autoscaling Ray Cluster from your local environment using the :ref:`Ray CLI `. For example, you can use ``ray up`` to launch a Ray cluster on Kubernetes and ``ray exec`` to execute commands in the Ray head node's pod. Note that using the Cluster Launcher requires Ray to be :ref:`installed locally `. diff --git a/doc/source/cluster/launcher.rst b/doc/source/cluster/launcher.rst new file mode 100644 index 000000000000..8c63f04f9a4f --- /dev/null +++ b/doc/source/cluster/launcher.rst @@ -0,0 +1,66 @@ +.. _ref-automatic-cluster: + +Launching Cloud Clusters with Ray +================================= + +Ray comes with a built-in cluster launcher that makes deploying a Ray cluster simple. + +The cluster launcher will provision resources from a node provider (like :ref:`AWS EC2 ` or :ref:`Kubernetes `) to instantiate the specified cluster, and start a Ray cluster on the provisioned resources. 
+ +You can configure the Ray Cluster Launcher to work with :ref:`a cloud provider `, an existing :ref:`Kubernetes cluster `, or a private cluster of machines. + +.. tabs:: + .. group-tab:: AWS + + .. code-block:: shell + + # First, run `pip install boto3` and `aws configure` + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/aws/example-full.yaml + + See :ref:`the AWS section ` for full instructions. + + .. group-tab:: GCP + + .. code-block:: shell + + # First, ``pip install google-api-python-client``, + # set up your GCP credentials, and + # create a new GCP project. + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/gcp/example-full.yaml + + See :ref:`the GCP section ` for full instructions. + + .. group-tab:: Azure + + .. code-block:: shell + + # First, install the Azure CLI + # (``pip install azure-cli azure-core``), then + # log in using ``az login``. + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/azure/example-full.yaml + + See :ref:`the Azure section ` for full instructions. + + +Once the Ray cluster is running, you can manually SSH into it or use provided commands like ``ray attach``, ``ray rsync-up``, and ``ray exec`` to access it and run Ray programs. + + +.. toctree:: + + /cluster/cloud.rst + /cluster/config.rst + /cluster/commands.rst + +Questions or Issues? +-------------------- + +.. include:: /_help.rst diff --git a/doc/source/cluster/quickstart.rst b/doc/source/cluster/quickstart.rst deleted file mode 100644 index f02db280e4b4..000000000000 --- a/doc/source/cluster/quickstart.rst +++ /dev/null @@ -1,240 +0,0 @@ -..
_ref-cluster-quick-start: - -Quick Start Cluster Autoscaling Demo -==================================== - -This quick start demonstrates the capabilities of the Ray cluster. Using the Ray cluster, we'll take a sample application designed to run on a laptop and scale it up in the cloud. Ray will launch clusters and scale Python with just a few commands. - -About the demo --------------- - -This demo will walk through an end-to-end flow: - -1. Create a (basic) Python application. -2. Launch a cluster on a cloud provider. -3. Run the application in the cloud. - -Requirements -~~~~~~~~~~~~ - -To run this demo, you will need: - -* Python installed on your development machine (typically your laptop), and -* an account at your preferred cloud provider (AWS, Azure or GCP). - -Setup -~~~~~ - -Before we start, you will need to install some Python dependencies as follows: - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: shell - - $ pip install -U ray boto3 - - .. group-tab:: Azure - - .. code-block:: shell - - $ pip install -U ray azure-cli azure-core - - .. group-tab:: GCP - - .. code-block:: shell - - $ pip install -U ray google-api-python-client - -Next, if you're not set up to use your cloud provider from the command line, you'll have to configure your credentials: - -.. tabs:: - .. group-tab:: AWS - - Configure your credentials in ``~/.aws/credentials`` as described in `the AWS docs `_. - - .. group-tab:: Azure - - Log in using ``az login``, then configure your credentials with ``az account set -s ``. - - .. group-tab:: GCP - - Set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable as described in `the GCP docs `_. - -Create a (basic) Python application ------------------------------------ - -We will write a simple Python application that tracks the IP addresses of the machines that its tasks are executed on: - -.. code-block:: python - - from collections import Counter - import socket - import time - - def f(): - time.sleep(0.001) - # Return IP address. 
- return socket.gethostbyname(socket.gethostname()) - - ip_addresses = [f() for _ in range(10000)] - print(Counter(ip_addresses)) - -Save this application as ``script.py`` and execute it by running the command ``python script.py``. The application should take 10 seconds to run and output something similar to ``Counter({'127.0.0.1': 10000})``. - -With some small changes, we can make this application run on Ray (for more information on how to do this, refer to :ref:`the Ray Core Walkthrough`): - -.. code-block:: python - - from collections import Counter - import socket - import time - - import ray - - ray.init() - - @ray.remote - def f(): - time.sleep(0.001) - # Return IP address. - return socket.gethostbyname(socket.gethostname()) - - object_ids = [f.remote() for _ in range(10000)] - ip_addresses = ray.get(object_ids) - print(Counter(ip_addresses)) - -Finally, let's add some code to make the output more interesting: - -.. code-block:: python - - from collections import Counter - import socket - import time - - import ray - - ray.init() - - print('''This cluster consists of - {} nodes in total - {} CPU resources in total - '''.format(len(ray.nodes()), ray.cluster_resources()['CPU'])) - - @ray.remote - def f(): - time.sleep(0.001) - # Return IP address. - return socket.gethostbyname(socket.gethostname()) - - object_ids = [f.remote() for _ in range(10000)] - ip_addresses = ray.get(object_ids) - - print('Tasks executed') - for ip_address, num_tasks in Counter(ip_addresses).items(): - print(' {} tasks on {}'.format(num_tasks, ip_address)) - -Running ``python script.py`` should now output something like: - -.. parsed-literal:: - - This cluster consists of - 1 nodes in total - 4.0 CPU resources in total - - Tasks executed - 10000 tasks on 127.0.0.1 - -Launch a cluster on a cloud provider ------------------------------------- - -To start a Ray Cluster, first we need to define the cluster configuration. 
The cluster configuration is defined within a YAML file that will be used by the Cluster Launcher to launch the head node, and by the Autoscaler to launch worker nodes. - -A minimal sample cluster configuration file looks as follows: - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: yaml - - # An unique identifier for the head node and workers of this cluster. - cluster_name: minimal - - # Cloud-provider specific configuration. - provider: - type: aws - region: us-west-2 - - .. group-tab:: Azure - - .. code-block:: yaml - - # An unique identifier for the head node and workers of this cluster. - cluster_name: minimal - - # Cloud-provider specific configuration. - provider: - type: azure - location: westus2 - resource_group: ray-cluster - - # How Ray will authenticate with newly launched nodes. - auth: - ssh_user: ubuntu - # you must specify paths to matching private and public key pair files - # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair - ssh_private_key: ~/.ssh/id_rsa - # changes to this should match what is specified in file_mounts - ssh_public_key: ~/.ssh/id_rsa.pub - - .. group-tab:: GCP - - .. code-block:: yaml - - # A unique identifier for the head node and workers of this cluster. - cluster_name: minimal - - # Cloud-provider specific configuration. - provider: - type: gcp - region: us-west1 - -Save this configuration file as ``config.yaml``. You can specify a lot more details in the configuration file: instance types to use, minimum and maximum number of workers to start, autoscaling strategy, files to sync, and more. For a full reference on the available configuration properties, please refer to the :ref:`cluster YAML configuration options reference `. - -After defining our configuration, we will use the Ray Cluster Launcher to start a cluster on the cloud, creating a designated "head node" and worker nodes. To start the Ray cluster, we will use the :ref:`Ray CLI `. Run the following command: - -.. 
code-block:: shell - - $ ray up -y config.yaml - -Run the application in the cloud --------------------------------- - -We are now ready to execute the application in across multiple machines on our Ray cloud cluster. Run the following command: - -.. code-block:: shell - - $ ray submit config.yaml script.py - -The output should now look similar to the following: - -.. parsed-literal:: - - This cluster consists of - 3 nodes in total - 6.0 CPU resources in total - - Tasks executed - 3425 tasks on xxx.xxx.xxx.xxx - 3834 tasks on xxx.xxx.xxx.xxx - 2741 tasks on xxx.xxx.xxx.xxx - -In this sample output, 3 nodes were started. If the output only shows 1 node, you may want to increase the ``secs`` in ``time.sleep(secs)`` to give Ray more time to start additional nodes. - -The Ray CLI offers additional functionality. For example, you can monitor the Ray cluster status with ``ray monitor config.yaml``, and you can connect to the cluster (ssh into the head node) with ``ray attach config.yaml``. For a full reference on the Ray CLI, please refer to :ref:`the cluster commands reference `. - -To finish, don't forget to shut down the cluster. Run the following command: - -.. code-block:: shell - - $ ray down -y config.yaml diff --git a/doc/source/cluster/reference.rst b/doc/source/cluster/reference.rst deleted file mode 100644 index ad9388060ae6..000000000000 --- a/doc/source/cluster/reference.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _cluster-reference: - -Config YAML and CLI Reference -============================= - -.. toctree:: - :maxdepth: 2 - - config.rst - commands.rst - sdk.rst diff --git a/doc/source/cluster/sdk.rst b/doc/source/cluster/sdk.rst deleted file mode 100644 index 7238ee55823f..000000000000 --- a/doc/source/cluster/sdk.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _ref-autoscaler-sdk: - -Autoscaler SDK -============== - -.. 
_ref-autoscaler-sdk-request-resources: - -ray.autoscaler.sdk.request_resources ------------------------------------- - -Within a Ray program, you can command the autoscaler to scale the cluster up to a desired size with ``request_resources()`` call. The cluster will immediately attempt to scale to accommodate the requested resources, bypassing normal upscaling speed constraints. - -.. autofunction:: ray.autoscaler.sdk.request_resources \ No newline at end of file diff --git a/doc/source/conf.py b/doc/source/conf.py index b1a74f2634ee..bdff928f76ba 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -148,7 +148,6 @@ class SimpleClass2(object): 'sphinx_gallery.gen_gallery', 'sphinxemoji.sphinxemoji', 'sphinx_copybutton', - 'sphinxcontrib.yt', 'versionwarning.extension', ] diff --git a/doc/source/dask-on-ray.rst b/doc/source/dask-on-ray.rst index 486dc9a1fcd8..0530fdc4c7dd 100644 --- a/doc/source/dask-on-ray.rst +++ b/doc/source/dask-on-ray.rst @@ -71,7 +71,7 @@ Here's an example: Why use Dask on Ray? 1. To take advantage of Ray-specific features such as the - :ref:`launching cloud clusters ` and + :ref:`cluster launcher ` and :ref:`shared-memory store `. 2. If you'd like to use Dask and Ray libraries in the same application without having two different clusters. 3. If you'd like to create data analyses using the familiar NumPy and Pandas APIs provided by Dask and execute them on a fast, fault-tolerant distributed task execution system geared towards production, like Ray. diff --git a/doc/source/index.rst b/doc/source/index.rst index 182ff7ef7ce4..e90b52299f5a 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -231,12 +231,11 @@ Papers .. toctree:: :hidden: :maxdepth: -1 - :caption: Ray Clusters/Autoscaler + :caption: Ray Cluster cluster/index.rst - cluster/quickstart.rst - cluster/reference.rst - cluster/cloud.rst + cluster/launcher.rst + cluster/autoscaling.rst cluster/deploy.rst .. 
toctree:: diff --git a/doc/source/serve/deployment.rst b/doc/source/serve/deployment.rst index ed397ec83266..1ab190595796 100644 --- a/doc/source/serve/deployment.rst +++ b/doc/source/serve/deployment.rst @@ -140,7 +140,7 @@ In order to deploy Ray Serve on Kubernetes, we need to do the following: 3. Start Ray Serve on the cluster. There are multiple ways to start a Ray cluster on Kubernetes, see :ref:`ray-k8s-deploy` for more information. -Here, we will be using the :ref:`Ray Cluster Launcher ` tool, which has support for Kubernetes as a backend. +Here, we will be using the :ref:`Ray Cluster Launcher ` tool, which has support for Kubernetes as a backend. The cluster launcher takes in a yaml config file that describes the cluster. Here, we'll be using the `Kubernetes default config`_ with a few small modifications. diff --git a/doc/source/starting-ray.rst b/doc/source/starting-ray.rst index b4bf4ce0206a..1791cc25b8ed 100644 --- a/doc/source/starting-ray.rst +++ b/doc/source/starting-ray.rst @@ -164,7 +164,7 @@ You can connect other nodes to the head node, creating a Ray cluster by also cal Launching a Ray cluster (``ray up``) ------------------------------------ -Ray clusters can be launched with the :ref:`Cluster Launcher `. +Ray clusters can be launched with the :ref:`Cluster Launcher `. The ``ray up`` command uses the Ray cluster launcher to start a cluster on the cloud, creating a designated "head node" and worker nodes. Underneath the hood, it automatically calls ``ray start`` to create a Ray cluster. Your code **only** needs to execute on one machine in the cluster (usually the head node). Read more about :ref:`running programs on a Ray cluster `. 
diff --git a/doc/source/tune/_tutorials/tune-distributed.rst b/doc/source/tune/_tutorials/tune-distributed.rst index 46b47e3bc757..498576e5b1d8 100644 --- a/doc/source/tune/_tutorials/tune-distributed.rst +++ b/doc/source/tune/_tutorials/tune-distributed.rst @@ -55,7 +55,7 @@ Launching a cloud cluster If you have already have a list of nodes, go to :ref:`tune-distributed-local`. -Ray currently supports AWS and GCP. Follow the instructions below to launch nodes on AWS (using the Deep Learning AMI). See the :ref:`cluster setup documentation `. Save the below cluster configuration (``tune-default.yaml``): +Ray currently supports AWS and GCP. Follow the instructions below to launch nodes on AWS (using the Deep Learning AMI). See the :ref:`cluster setup documentation `. Save the below cluster configuration (``tune-default.yaml``): .. literalinclude:: /../../python/ray/tune/examples/tune-default.yaml :language: yaml @@ -130,7 +130,7 @@ If you used a cluster configuration (starting a cluster with ``ray up`` or ``ray Syncing ------- -Tune automatically syncs the trial folder on remote nodes back to the head node. This requires the ray cluster to be started with the :ref:`cluster launcher `. +Tune automatically syncs the trial folder on remote nodes back to the head node. This requires the ray cluster to be started with the :ref:`cluster launcher `. By default, local syncing requires rsync to be installed. You can customize the sync command with the ``sync_to_driver`` argument in ``tune.SyncConfig`` by providing either a function or a string. If a string is provided, then it must include replacement fields ``{source}`` and ``{target}``, like ``rsync -savz -e "ssh -i ssh_key.pem" {source} {target}``. Alternatively, a function can be provided with the following signature: @@ -290,7 +290,7 @@ Upon a second run, this will restore the entire experiment state from ``~/path/t Common Commands --------------- -Below are some commonly used commands for submitting experiments. 
Please see the :ref:`Autoscaler page ` to see find more comprehensive documentation of commands. +Below are some commonly used commands for submitting experiments. Please see the :ref:`Autoscaler page ` to see find more comprehensive documentation of commands. .. code-block:: bash diff --git a/doc/source/tune/user-guide.rst b/doc/source/tune/user-guide.rst index 8dd636042510..909ebbc9faf4 100644 --- a/doc/source/tune/user-guide.rst +++ b/doc/source/tune/user-guide.rst @@ -265,7 +265,7 @@ You can restore a single trial checkpoint by using ``tune.run(restore=` and also requires rsync to be installed. +On a multinode cluster, Tune automatically creates a copy of all trial checkpoints on the head node. This requires the Ray cluster to be started with the :ref:`cluster launcher ` and also requires rsync to be installed. Note that you must use the ``tune.checkpoint_dir`` API to trigger syncing. Also, if running Tune on Kubernetes, be sure to use the :ref:`KubernetesSyncer ` to transfer files between different pods. From 0f22abc318598801b6144d3d04be442d69b67895 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 014/244] Revert "Export additional metrics to Prometheus (#14061)" This reverts commit c16df38a1632be2cc3154154e866bc2b5de7f564. 
--- dashboard/modules/reporter/reporter_agent.py | 74 ++----------------- .../modules/reporter/tests/test_reporter.py | 8 +- 2 files changed, 8 insertions(+), 74 deletions(-) diff --git a/dashboard/modules/reporter/reporter_agent.py b/dashboard/modules/reporter/reporter_agent.py index e604f7463f86..3d9472a3dee3 100644 --- a/dashboard/modules/reporter/reporter_agent.py +++ b/dashboard/modules/reporter/reporter_agent.py @@ -77,25 +77,7 @@ def __init__(self, dashboard_agent): "node_cpu": Gauge("node_cpu", "Total CPU usage on a ray node", "percentage", ["ip"]), "node_mem": Gauge("node_mem", "Total memory usage on a ray node", - "bytes", ["ip"]), - "node_disk_usage": Gauge("node_disk_usage", - "Total disk usage (bytes) on a ray node", - "bytes", ["ip"]), - "node_disk_utilization_percentage": Gauge( - "node_disk_utilization_percentage", - "Total disk utilization (percentage) on a ray node", - "percentage", ["ip"]), - "node_network_sent": Gauge("node_network_sent", - "Total network sent", "bytes", ["ip"]), - "node_network_received": Gauge("node_network_received", - "Total network received", "bytes", - ["ip"]), - "node_network_send_speed": Gauge("node_network_send_speed", - "Network send speed", "bytes/sec", - ["ip"]), - "node_network_receive_speed": Gauge("node_network_receive_speed", - "Network receive speed", - "bytes/sec", ["ip"]), + "mb", ["ip"]), "raylet_cpu": Gauge("raylet_cpu", "CPU usage of the raylet on a node.", "percentage", ["ip", "pid"]), @@ -255,10 +237,8 @@ def _get_all_stats(self): self._network_stats_hist.append((now, network_stats)) self._network_stats_hist = self._network_stats_hist[-7:] then, prev_network_stats = self._network_stats_hist[0] - prev_send, prev_recv = prev_network_stats - now_send, now_recv = network_stats - network_speed_stats = ((now_send - prev_send) / (now - then), - (now_recv - prev_recv) / (now - then)) + netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then), + (network_stats[1] - prev_network_stats[1]) / (now - 
then)) return { "now": now, "hostname": self._hostname, @@ -271,8 +251,7 @@ def _get_all_stats(self): "loadAvg": self._get_load_avg(), "disk": self._get_disk_usage(), "gpus": self._get_gpu_usage(), - "network": network_stats, - "network_speed": network_speed_stats, + "net": netstats, "cmdline": self._get_raylet_cmdline(), } @@ -285,45 +264,10 @@ def _record_stats(self, stats): # -- Mem per node -- total, avail, _ = stats["mem"] - mem_usage = float(total - avail) + mem_usage = float(total - avail) / 1e6 mem_record = Record( gauge=self._gauges["node_mem"], value=mem_usage, tags={"ip": ip}) - # -- Disk per node -- - used, free = 0, 0 - for entry in stats["disk"].values(): - used += entry.used - free += entry.free - disk_utilization = float(used / (used + free)) * 100 - disk_usage_record = Record( - gauge=self._gauges["node_disk_usage"], value=used, tags={"ip": ip}) - disk_utilization_percentage_record = Record( - gauge=self._gauges["node_disk_utilization_percentage"], - value=disk_utilization, - tags={"ip": ip}) - - # -- Network speed (send/receive) stats per node -- - network_stats = stats["network"] - network_sent_record = Record( - gauge=self._gauges["node_network_sent"], - value=network_stats[0], - tags={"ip": ip}) - network_received_record = Record( - gauge=self._gauges["node_network_received"], - value=network_stats[1], - tags={"ip": ip}) - - # -- Network speed (send/receive) per node -- - network_speed_stats = stats["network_speed"] - network_send_speed_record = Record( - gauge=self._gauges["node_network_send_speed"], - value=network_speed_stats[0], - tags={"ip": ip}) - network_receive_speed_record = Record( - gauge=self._gauges["node_network_receive_speed"], - value=network_speed_stats[1], - tags={"ip": ip}) - raylet_stats = self._get_raylet_stats() raylet_pid = str(raylet_stats["pid"]) # -- raylet CPU -- @@ -346,12 +290,8 @@ def _record_stats(self, stats): "pid": raylet_pid }) - self._metrics_agent.record_reporter_stats([ - cpu_record, mem_record, 
disk_usage_record, - disk_utilization_percentage_record, network_sent_record, - network_received_record, network_send_speed_record, - network_receive_speed_record, raylet_cpu_record, raylet_mem_record - ]) + self._metrics_agent.record_reporter_stats( + [cpu_record, mem_record, raylet_cpu_record, raylet_mem_record]) async def _perform_iteration(self, aioredis_client): """Get any changes to the log files and push updates to Redis.""" diff --git a/dashboard/modules/reporter/tests/test_reporter.py b/dashboard/modules/reporter/tests/test_reporter.py index 72617562f92c..001ea42a5b88 100644 --- a/dashboard/modules/reporter/tests/test_reporter.py +++ b/dashboard/modules/reporter/tests/test_reporter.py @@ -105,13 +105,7 @@ def test_case_stats_exist(): prom_addresses) return all([ "ray_node_cpu" in metric_names, "ray_node_mem" in metric_names, - "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names, - "ray_node_disk_usage" in metric_names, - "ray_node_disk_utilization_percentage" in metric_names, - "ray_node_network_sent" in metric_names, - "ray_node_network_received" in metric_names, - "ray_node_network_send_speed" in metric_names, - "ray_node_network_receive_speed" in metric_names + "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names ]) def test_case_ip_correct(): From d0f4f69ba01c0008b1f3b448835652a9871e7115 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 015/244] Revert "Revert "Unhandled exception handler based on local ref counting (#14049)" (#14099)" This reverts commit 31231f1f8aea8499c8bf4a78048a4492229c2989. 
--- BUILD.bazel | 9 +++ python/ray/_raylet.pyx | 25 +++++- python/ray/includes/libcoreworker.pxd | 1 + python/ray/tests/test_failure.py | 46 +++++++++++ python/ray/worker.py | 79 +++++-------------- src/ray/common/ray_object.h | 8 ++ src/ray/core_worker/core_worker.cc | 2 +- src/ray/core_worker/core_worker.h | 3 + .../memory_store/memory_store.cc | 29 ++++++- .../memory_store/memory_store.h | 9 ++- src/ray/core_worker/test/memory_store_test.cc | 66 ++++++++++++++++ 11 files changed, 209 insertions(+), 68 deletions(-) create mode 100644 src/ray/core_worker/test/memory_store_test.cc diff --git a/BUILD.bazel b/BUILD.bazel index c1745e468852..c9c049f623c6 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -702,6 +702,15 @@ cc_test( ], ) +cc_test( + name = "memory_store_test", + srcs = ["src/ray/core_worker/test/memory_store_test.cc"], + deps = [ + ":core_worker_lib", + "@com_google_googletest//:gtest_main", + ], +) + cc_test( name = "direct_actor_transport_test", srcs = ["src/ray/core_worker/test/direct_actor_transport_test.cc"], diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index da00f627345e..3dda95988cd3 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -724,6 +724,20 @@ cdef void delete_spilled_objects_handler( job_id=None) +cdef void unhandled_exception_handler(const CRayObject& error) nogil: + with gil: + worker = ray.worker.global_worker + data = None + metadata = None + if error.HasData(): + data = Buffer.make(error.GetData()) + if error.HasMetadata(): + metadata = Buffer.make(error.GetMetadata()).to_pybytes() + # TODO(ekl) why does passing a ObjectRef.nil() lead to shutdown errors? + object_ids = [None] + worker.raise_errors([(data, metadata)], object_ids) + + # This function introduces ~2-7us of overhead per call (i.e., it can be called # up to hundreds of thousands of times per second). 
cdef void get_py_stack(c_string* stack_out) nogil: @@ -833,6 +847,7 @@ cdef class CoreWorker: options.spill_objects = spill_objects_handler options.restore_spilled_objects = restore_spilled_objects_handler options.delete_spilled_objects = delete_spilled_objects_handler + options.unhandled_exception_handler = unhandled_exception_handler options.get_lang_stack = get_py_stack options.ref_counting_enabled = True options.is_local_mode = local_mode @@ -1443,9 +1458,13 @@ cdef class CoreWorker: object_ref.native()) def remove_object_ref_reference(self, ObjectRef object_ref): - # Note: faster to not release GIL for short-running op. - CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( - object_ref.native()) + cdef: + CObjectID c_object_id = object_ref.native() + # We need to release the gil since object destruction may call the + # unhandled exception handler. + with nogil: + CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( + c_object_id) def serialize_and_promote_object_ref(self, ObjectRef object_ref): cdef: diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 6114b9e7d58c..2eb5f109bf65 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -250,6 +250,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: (void( const c_vector[c_string]&, CWorkerType) nogil) delete_spilled_objects + (void(const CRayObject&) nogil) unhandled_exception_handler (void(c_string *stack_out) nogil) get_lang_stack c_bool ref_counting_enabled c_bool is_local_mode diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index fca209743129..b28ebe1ae10d 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -20,6 +20,52 @@ get_error_message, Semaphore) +def test_unhandled_errors(ray_start_regular): + @ray.remote + def f(): + raise ValueError() + + @ray.remote + class Actor: + def f(self): + raise ValueError() + + a = Actor.remote() + 
num_exceptions = 0 + + def interceptor(e): + nonlocal num_exceptions + num_exceptions += 1 + + # Test we report unhandled exceptions. + ray.worker._unhandled_error_handler = interceptor + x1 = f.remote() + x2 = a.f.remote() + del x1 + del x2 + wait_for_condition(lambda: num_exceptions == 2) + + # Test we don't report handled exceptions. + x1 = f.remote() + x2 = a.f.remote() + with pytest.raises(ray.exceptions.RayError) as err: # noqa + ray.get([x1, x2]) + del x1 + del x2 + time.sleep(1) + assert num_exceptions == 2, num_exceptions + + # Test suppression with env var works. + try: + os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1" + x1 = f.remote() + del x1 + time.sleep(1) + assert num_exceptions == 2, num_exceptions + finally: + del os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] + + def test_failed_task(ray_start_regular, error_pubsub): @ray.remote def throw_exception_fct1(): diff --git a/python/ray/worker.py b/python/ray/worker.py index 00d99930cf95..5ca73860ad63 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -9,7 +9,6 @@ import logging import os import redis -from six.moves import queue import sys import threading import time @@ -69,6 +68,12 @@ logger = logging.getLogger(__name__) +# Visible for testing. +def _unhandled_error_handler(e: Exception): + logger.error("Unhandled error (suppress with " + "RAY_IGNORE_UNHANDLED_ERRORS=1): {}".format(e)) + + class Worker: """A class used to define the control flow of a worker process. 
@@ -277,6 +282,14 @@ def put_object(self, value, object_ref=None): self.core_worker.put_serialized_object( serialized_value, object_ref=object_ref)) + def raise_errors(self, data_metadata_pairs, object_refs): + context = self.get_serialization_context() + out = context.deserialize_objects(data_metadata_pairs, object_refs) + if "RAY_IGNORE_UNHANDLED_ERRORS" in os.environ: + return + for e in out: + _unhandled_error_handler(e) + def deserialize_objects(self, data_metadata_pairs, object_refs): context = self.get_serialization_context() return context.deserialize_objects(data_metadata_pairs, object_refs) @@ -863,13 +876,6 @@ def custom_excepthook(type, value, tb): sys.excepthook = custom_excepthook -# The last time we raised a TaskError in this process. We use this value to -# suppress redundant error messages pushed from the workers. -last_task_error_raise_time = 0 - -# The max amount of seconds to wait before printing out an uncaught error. -UNCAUGHT_ERROR_GRACE_PERIOD = 5 - def print_logs(redis_client, threads_stopped, job_id): """Prints log messages from workers on all of the nodes. @@ -1020,42 +1026,7 @@ def color_for(data: Dict[str, str]) -> str: file=print_file) -def print_error_messages_raylet(task_error_queue, threads_stopped): - """Prints message received in the given output queue. - - This checks periodically if any un-raised errors occurred in the - background. - - Args: - task_error_queue (queue.Queue): A queue used to receive errors from the - thread that listens to Redis. - threads_stopped (threading.Event): A threading event used to signal to - the thread that it should exit. - """ - - while True: - # Exit if we received a signal that we should stop. - if threads_stopped.is_set(): - return - - try: - error, t = task_error_queue.get(block=False) - except queue.Empty: - threads_stopped.wait(timeout=0.01) - continue - # Delay errors a little bit of time to attempt to suppress redundant - # messages originating from the worker. 
- while t + UNCAUGHT_ERROR_GRACE_PERIOD > time.time(): - threads_stopped.wait(timeout=1) - if threads_stopped.is_set(): - break - if t < last_task_error_raise_time + UNCAUGHT_ERROR_GRACE_PERIOD: - logger.debug(f"Suppressing error from worker: {error}") - else: - logger.error(f"Possible unhandled error from worker: {error}") - - -def listen_error_messages_raylet(worker, task_error_queue, threads_stopped): +def listen_error_messages_raylet(worker, threads_stopped): """Listen to error messages in the background on the driver. This runs in a separate thread on the driver and pushes (error, time) @@ -1063,8 +1034,6 @@ def listen_error_messages_raylet(worker, task_error_queue, threads_stopped): Args: worker: The worker class that this thread belongs to. - task_error_queue (queue.Queue): A queue used to communicate with the - thread that prints the errors found by this thread. threads_stopped (threading.Event): A threading event used to signal to the thread that it should exit. """ @@ -1103,8 +1072,9 @@ def listen_error_messages_raylet(worker, task_error_queue, threads_stopped): error_message = error_data.error_message if (error_data.type == ray_constants.TASK_PUSH_ERROR): - # Delay it a bit to see if we can suppress it - task_error_queue.put((error_message, time.time())) + # TODO(ekl) remove task push errors entirely now that we have + # the separate unhandled exception handler. + pass else: logger.warning(error_message) except (OSError, redis.exceptions.ConnectionError) as e: @@ -1267,19 +1237,12 @@ def connect(node, # temporarily using this implementation which constantly queries the # scheduler for new error messages. 
if mode == SCRIPT_MODE: - q = queue.Queue() worker.listener_thread = threading.Thread( target=listen_error_messages_raylet, name="ray_listen_error_messages", - args=(worker, q, worker.threads_stopped)) - worker.printer_thread = threading.Thread( - target=print_error_messages_raylet, - name="ray_print_error_messages", - args=(q, worker.threads_stopped)) + args=(worker, worker.threads_stopped)) worker.listener_thread.daemon = True worker.listener_thread.start() - worker.printer_thread.daemon = True - worker.printer_thread.start() if log_to_driver: global_worker_stdstream_dispatcher.add_handler( "ray_print_logs", print_to_stdstream) @@ -1332,8 +1295,6 @@ def disconnect(exiting_interpreter=False): worker.import_thread.join_import_thread() if hasattr(worker, "listener_thread"): worker.listener_thread.join() - if hasattr(worker, "printer_thread"): - worker.printer_thread.join() if hasattr(worker, "logger_thread"): worker.logger_thread.join() worker.threads_stopped.clear() @@ -1445,13 +1406,11 @@ def get(object_refs, *, timeout=None): raise ValueError("'object_refs' must either be an object ref " "or a list of object refs.") - global last_task_error_raise_time # TODO(ujvl): Consider how to allow user to retrieve the ready objects. values, debugger_breakpoint = worker.get_objects( object_refs, timeout=timeout) for i, value in enumerate(values): if isinstance(value, RayError): - last_task_error_raise_time = time.time() if isinstance(value, ray.exceptions.ObjectLostError): worker.core_worker.dump_object_store_memory_usage() if isinstance(value, RayTaskError): diff --git a/src/ray/common/ray_object.h b/src/ray/common/ray_object.h index 633a5d787c7e..c036550a8652 100644 --- a/src/ray/common/ray_object.h +++ b/src/ray/common/ray_object.h @@ -92,12 +92,20 @@ class RayObject { /// large to return directly as part of a gRPC response). bool IsInPlasmaError() const; + /// Mark this object as accessed before. 
+ void SetAccessed() { accessed_ = true; }; + + /// Check if this object was accessed before. + bool WasAccessed() const { return accessed_; } + private: std::shared_ptr data_; std::shared_ptr metadata_; const std::vector nested_ids_; /// Whether this class holds a data copy. bool has_data_copy_; + /// Whether this object was accessed. + bool accessed_ = false; }; } // namespace ray diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 0180e0a7ab84..06d12387c8ad 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -422,7 +422,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ return Status::OK(); }, options_.ref_counting_enabled ? reference_counter_ : nullptr, local_raylet_client_, - options_.check_signals)); + options_.check_signals, options_.unhandled_exception_handler)); auto check_node_alive_fn = [this](const NodeID &node_id) { auto node = gcs_client_->Nodes().Get(node_id); diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 2ced7a10fdb8..47023df7b40b 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -82,6 +82,7 @@ struct CoreWorkerOptions { spill_objects(nullptr), restore_spilled_objects(nullptr), delete_spilled_objects(nullptr), + unhandled_exception_handler(nullptr), get_lang_stack(nullptr), kill_main(nullptr), ref_counting_enabled(false), @@ -146,6 +147,8 @@ struct CoreWorkerOptions { /// Application-language callback to delete objects from external storage. std::function &, rpc::WorkerType)> delete_spilled_objects; + /// Function to call on error objects never retrieved. + std::function unhandled_exception_handler; /// Language worker callback to get the current call stack. std::function get_lang_stack; // Function that tries to interrupt the currently running Python thread. 
diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.cc b/src/ray/core_worker/store_provider/memory_store/memory_store.cc index 6dad1b37be72..7897b6504e82 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.cc +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.cc @@ -93,6 +93,7 @@ void GetRequest::Set(const ObjectID &object_id, std::shared_ptr objec if (is_ready_) { return; // We have already hit the number of objects to return limit. } + object->SetAccessed(); objects_.emplace(object_id, object); if (objects_.size() == num_objects_ || (abort_if_any_object_is_exception_ && object->IsException() && @@ -106,6 +107,7 @@ std::shared_ptr GetRequest::Get(const ObjectID &object_id) const { std::unique_lock lock(mutex_); auto iter = objects_.find(object_id); if (iter != objects_.end()) { + iter->second->SetAccessed(); return iter->second; } @@ -116,11 +118,13 @@ CoreWorkerMemoryStore::CoreWorkerMemoryStore( std::function store_in_plasma, std::shared_ptr counter, std::shared_ptr raylet_client, - std::function check_signals) + std::function check_signals, + std::function unhandled_exception_handler) : store_in_plasma_(store_in_plasma), ref_counter_(counter), raylet_client_(raylet_client), - check_signals_(check_signals) {} + check_signals_(check_signals), + unhandled_exception_handler_(unhandled_exception_handler) {} void CoreWorkerMemoryStore::GetAsync( const ObjectID &object_id, std::function)> callback) { @@ -136,6 +140,7 @@ void CoreWorkerMemoryStore::GetAsync( } // It's important for performance to run the callback outside the lock. 
if (ptr != nullptr) { + ptr->SetAccessed(); callback(ptr); } } @@ -146,6 +151,7 @@ std::shared_ptr CoreWorkerMemoryStore::GetOrPromoteToPlasma( auto iter = objects_.find(object_id); if (iter != objects_.end()) { auto obj = iter->second; + obj->SetAccessed(); if (obj->IsInPlasmaError()) { return nullptr; } @@ -210,6 +216,8 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ if (should_add_entry) { // If there is no existing get request, then add the `RayObject` to map. objects_.emplace(object_id, object_entry); + } else { + OnErase(object_entry); } } @@ -223,6 +231,7 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ // It's important for performance to run the callbacks outside the lock. for (const auto &cb : async_callbacks) { + object_entry->SetAccessed(); cb(object_entry); } @@ -257,6 +266,7 @@ Status CoreWorkerMemoryStore::GetImpl(const std::vector &object_ids, const auto &object_id = object_ids[i]; auto iter = objects_.find(object_id); if (iter != objects_.end()) { + iter->second->SetAccessed(); (*results)[i] = iter->second; if (remove_after_get) { // Note that we cannot remove the object_id from `objects_` now, @@ -426,6 +436,7 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i if (it->second->IsInPlasmaError()) { plasma_ids_to_delete->insert(object_id); } else { + OnErase(it->second); objects_.erase(it); } } @@ -435,7 +446,11 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i void CoreWorkerMemoryStore::Delete(const std::vector &object_ids) { absl::MutexLock lock(&mu_); for (const auto &object_id : object_ids) { - objects_.erase(object_id); + auto it = objects_.find(object_id); + if (it != objects_.end()) { + OnErase(it->second); + objects_.erase(it); + } } } @@ -451,6 +466,14 @@ bool CoreWorkerMemoryStore::Contains(const ObjectID &object_id, bool *in_plasma) return false; } +void CoreWorkerMemoryStore::OnErase(std::shared_ptr obj) { + // TODO(ekl) 
note that this doesn't warn on errors that are stored in plasma. + if (obj->IsException() && !obj->IsInPlasmaError() && !obj->WasAccessed() && + unhandled_exception_handler_ != nullptr) { + unhandled_exception_handler_(*obj); + } +} + MemoryStoreStats CoreWorkerMemoryStore::GetMemoryStoreStatisticalData() { absl::MutexLock lock(&mu_); MemoryStoreStats item; diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.h b/src/ray/core_worker/store_provider/memory_store/memory_store.h index 709227f65206..0ca94ef6cc02 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.h +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.h @@ -35,7 +35,8 @@ class CoreWorkerMemoryStore { std::function store_in_plasma = nullptr, std::shared_ptr counter = nullptr, std::shared_ptr raylet_client = nullptr, - std::function check_signals = nullptr); + std::function check_signals = nullptr, + std::function unhandled_exception_handler = nullptr); ~CoreWorkerMemoryStore(){}; /// Put an object with specified ID into object store. @@ -143,6 +144,9 @@ class CoreWorkerMemoryStore { std::vector> *results, bool abort_if_any_object_is_exception); + /// Called when an object is erased from the store. + void OnErase(std::shared_ptr obj); + /// Optional callback for putting objects into the plasma store. std::function store_in_plasma_; @@ -173,6 +177,9 @@ class CoreWorkerMemoryStore { /// Function passed in to be called to check for signals (e.g., Ctrl-C). std::function check_signals_; + + /// Function called to report unhandled exceptions. + std::function unhandled_exception_handler_; }; } // namespace ray diff --git a/src/ray/core_worker/test/memory_store_test.cc b/src/ray/core_worker/test/memory_store_test.cc new file mode 100644 index 000000000000..f4403e4a887e --- /dev/null +++ b/src/ray/core_worker/test/memory_store_test.cc @@ -0,0 +1,66 @@ +// Copyright 2017 The Ray Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ray/core_worker/store_provider/memory_store/memory_store.h" + +#include "gtest/gtest.h" +#include "ray/common/test_util.h" + +namespace ray { + +TEST(TestMemoryStore, TestReportUnhandledErrors) { + std::vector> results; + WorkerContext context(WorkerType::WORKER, WorkerID::FromRandom(), JobID::FromInt(0)); + int unhandled_count = 0; + + std::shared_ptr provider = + std::make_shared( + nullptr, nullptr, nullptr, nullptr, + [&](const RayObject &obj) { unhandled_count++; }); + RayObject obj1(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); + RayObject obj2(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); + auto id1 = ObjectID::FromRandom(); + auto id2 = ObjectID::FromRandom(); + + // Check delete without get. + RAY_CHECK(provider->Put(obj1, id1)); + RAY_CHECK(provider->Put(obj2, id2)); + ASSERT_EQ(unhandled_count, 0); + provider->Delete({id1, id2}); + ASSERT_EQ(unhandled_count, 2); + unhandled_count = 0; + + // Check delete after get. + RAY_CHECK(provider->Put(obj1, id1)); + RAY_CHECK(provider->Put(obj1, id2)); + provider->Get({id1}, 1, 100, context, false, &results); + provider->GetOrPromoteToPlasma(id2); + provider->Delete({id1, id2}); + ASSERT_EQ(unhandled_count, 0); + + // Check delete after async get. 
+ provider->GetAsync({id2}, [](std::shared_ptr obj) {}); + RAY_CHECK(provider->Put(obj1, id1)); + RAY_CHECK(provider->Put(obj2, id2)); + provider->GetAsync({id1}, [](std::shared_ptr obj) {}); + provider->Delete({id1, id2}); + ASSERT_EQ(unhandled_count, 0); +} + +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From a99ada74be457533f6ec75f6149706276f742623 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 016/244] Revert "skip restart and multi restart test on win (#14084)" This reverts commit 59c26ac766fe3e5963a2b59b6baa3f69d20fdbf1. --- python/ray/tests/test_actor_failures.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index ff9c9fd45a0e..f26f87a0c101 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -73,7 +73,6 @@ def create_object(self, size): assert num_success > 0 -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") def test_actor_restart(ray_init_with_task_retry_delay): """Test actor restart when actor process is killed.""" @@ -434,7 +433,6 @@ def increase(self): assert ray.get(RetryableTask.remote(remote_actor)) == 3 -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") # NOTE(hchen): we set object_timeout_milliseconds to 1s for # this test. Because if this value is too small, suprious task reconstruction # may happen and cause the test fauilure. If the value is too large, this test From 91a780449603da3c026ac81d518e736ae9d9c6ca Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 017/244] Revert "[hotfix] Fix mac build (#14075)" This reverts commit 172293087751d30e2b0f9f922330dc05656f0184. 
--- src/ray/core_worker/reference_count.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index 652663ecf50c..87400ca21252 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -1026,7 +1026,7 @@ void ReferenceCounter::PushToLocationSubscribers(ReferenceTable::iterator it) { const auto callbacks = it->second.location_subscription_callbacks; it->second.location_subscription_callbacks.clear(); it->second.location_version++; - for (const auto &callback : callbacks) { + for (const auto callback : callbacks) { callback(it->second.locations, it->second.object_size, it->second.spilled_url, it->second.spilled_node_id, it->second.location_version); } From 52042d1f600c93f134f750906b2f20a28d4d86f0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 018/244] Revert "Unhandled exception handler based on local ref counting (#14049)" This reverts commit 64783d30f83a973533436588397820b5f00ac0a6. 
--- BUILD.bazel | 9 --- python/ray/_raylet.pyx | 25 +----- python/ray/includes/libcoreworker.pxd | 1 - python/ray/tests/test_failure.py | 46 ----------- python/ray/worker.py | 79 ++++++++++++++----- src/ray/common/ray_object.h | 8 -- src/ray/core_worker/core_worker.cc | 2 +- src/ray/core_worker/core_worker.h | 3 - .../memory_store/memory_store.cc | 29 +------ .../memory_store/memory_store.h | 9 +-- src/ray/core_worker/test/memory_store_test.cc | 66 ---------------- 11 files changed, 68 insertions(+), 209 deletions(-) delete mode 100644 src/ray/core_worker/test/memory_store_test.cc diff --git a/BUILD.bazel b/BUILD.bazel index c9c049f623c6..c1745e468852 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -702,15 +702,6 @@ cc_test( ], ) -cc_test( - name = "memory_store_test", - srcs = ["src/ray/core_worker/test/memory_store_test.cc"], - deps = [ - ":core_worker_lib", - "@com_google_googletest//:gtest_main", - ], -) - cc_test( name = "direct_actor_transport_test", srcs = ["src/ray/core_worker/test/direct_actor_transport_test.cc"], diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3dda95988cd3..da00f627345e 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -724,20 +724,6 @@ cdef void delete_spilled_objects_handler( job_id=None) -cdef void unhandled_exception_handler(const CRayObject& error) nogil: - with gil: - worker = ray.worker.global_worker - data = None - metadata = None - if error.HasData(): - data = Buffer.make(error.GetData()) - if error.HasMetadata(): - metadata = Buffer.make(error.GetMetadata()).to_pybytes() - # TODO(ekl) why does passing a ObjectRef.nil() lead to shutdown errors? - object_ids = [None] - worker.raise_errors([(data, metadata)], object_ids) - - # This function introduces ~2-7us of overhead per call (i.e., it can be called # up to hundreds of thousands of times per second). 
cdef void get_py_stack(c_string* stack_out) nogil: @@ -847,7 +833,6 @@ cdef class CoreWorker: options.spill_objects = spill_objects_handler options.restore_spilled_objects = restore_spilled_objects_handler options.delete_spilled_objects = delete_spilled_objects_handler - options.unhandled_exception_handler = unhandled_exception_handler options.get_lang_stack = get_py_stack options.ref_counting_enabled = True options.is_local_mode = local_mode @@ -1458,13 +1443,9 @@ cdef class CoreWorker: object_ref.native()) def remove_object_ref_reference(self, ObjectRef object_ref): - cdef: - CObjectID c_object_id = object_ref.native() - # We need to release the gil since object destruction may call the - # unhandled exception handler. - with nogil: - CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( - c_object_id) + # Note: faster to not release GIL for short-running op. + CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( + object_ref.native()) def serialize_and_promote_object_ref(self, ObjectRef object_ref): cdef: diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 2eb5f109bf65..6114b9e7d58c 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -250,7 +250,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: (void( const c_vector[c_string]&, CWorkerType) nogil) delete_spilled_objects - (void(const CRayObject&) nogil) unhandled_exception_handler (void(c_string *stack_out) nogil) get_lang_stack c_bool ref_counting_enabled c_bool is_local_mode diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index b28ebe1ae10d..fca209743129 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -20,52 +20,6 @@ get_error_message, Semaphore) -def test_unhandled_errors(ray_start_regular): - @ray.remote - def f(): - raise ValueError() - - @ray.remote - class Actor: - def f(self): - raise ValueError() - - a = Actor.remote() - 
num_exceptions = 0 - - def interceptor(e): - nonlocal num_exceptions - num_exceptions += 1 - - # Test we report unhandled exceptions. - ray.worker._unhandled_error_handler = interceptor - x1 = f.remote() - x2 = a.f.remote() - del x1 - del x2 - wait_for_condition(lambda: num_exceptions == 2) - - # Test we don't report handled exceptions. - x1 = f.remote() - x2 = a.f.remote() - with pytest.raises(ray.exceptions.RayError) as err: # noqa - ray.get([x1, x2]) - del x1 - del x2 - time.sleep(1) - assert num_exceptions == 2, num_exceptions - - # Test suppression with env var works. - try: - os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1" - x1 = f.remote() - del x1 - time.sleep(1) - assert num_exceptions == 2, num_exceptions - finally: - del os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] - - def test_failed_task(ray_start_regular, error_pubsub): @ray.remote def throw_exception_fct1(): diff --git a/python/ray/worker.py b/python/ray/worker.py index 5ca73860ad63..00d99930cf95 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -9,6 +9,7 @@ import logging import os import redis +from six.moves import queue import sys import threading import time @@ -68,12 +69,6 @@ logger = logging.getLogger(__name__) -# Visible for testing. -def _unhandled_error_handler(e: Exception): - logger.error("Unhandled error (suppress with " - "RAY_IGNORE_UNHANDLED_ERRORS=1): {}".format(e)) - - class Worker: """A class used to define the control flow of a worker process. 
@@ -282,14 +277,6 @@ def put_object(self, value, object_ref=None): self.core_worker.put_serialized_object( serialized_value, object_ref=object_ref)) - def raise_errors(self, data_metadata_pairs, object_refs): - context = self.get_serialization_context() - out = context.deserialize_objects(data_metadata_pairs, object_refs) - if "RAY_IGNORE_UNHANDLED_ERRORS" in os.environ: - return - for e in out: - _unhandled_error_handler(e) - def deserialize_objects(self, data_metadata_pairs, object_refs): context = self.get_serialization_context() return context.deserialize_objects(data_metadata_pairs, object_refs) @@ -876,6 +863,13 @@ def custom_excepthook(type, value, tb): sys.excepthook = custom_excepthook +# The last time we raised a TaskError in this process. We use this value to +# suppress redundant error messages pushed from the workers. +last_task_error_raise_time = 0 + +# The max amount of seconds to wait before printing out an uncaught error. +UNCAUGHT_ERROR_GRACE_PERIOD = 5 + def print_logs(redis_client, threads_stopped, job_id): """Prints log messages from workers on all of the nodes. @@ -1026,7 +1020,42 @@ def color_for(data: Dict[str, str]) -> str: file=print_file) -def listen_error_messages_raylet(worker, threads_stopped): +def print_error_messages_raylet(task_error_queue, threads_stopped): + """Prints message received in the given output queue. + + This checks periodically if any un-raised errors occurred in the + background. + + Args: + task_error_queue (queue.Queue): A queue used to receive errors from the + thread that listens to Redis. + threads_stopped (threading.Event): A threading event used to signal to + the thread that it should exit. + """ + + while True: + # Exit if we received a signal that we should stop. 
+ if threads_stopped.is_set(): + return + + try: + error, t = task_error_queue.get(block=False) + except queue.Empty: + threads_stopped.wait(timeout=0.01) + continue + # Delay errors a little bit of time to attempt to suppress redundant + # messages originating from the worker. + while t + UNCAUGHT_ERROR_GRACE_PERIOD > time.time(): + threads_stopped.wait(timeout=1) + if threads_stopped.is_set(): + break + if t < last_task_error_raise_time + UNCAUGHT_ERROR_GRACE_PERIOD: + logger.debug(f"Suppressing error from worker: {error}") + else: + logger.error(f"Possible unhandled error from worker: {error}") + + +def listen_error_messages_raylet(worker, task_error_queue, threads_stopped): """Listen to error messages in the background on the driver. This runs in a separate thread on the driver and pushes (error, time) @@ -1034,6 +1063,8 @@ def listen_error_messages_raylet(worker, threads_stopped): Args: worker: The worker class that this thread belongs to. + task_error_queue (queue.Queue): A queue used to communicate with the + thread that prints the errors found by this thread. threads_stopped (threading.Event): A threading event used to signal to the thread that it should exit. """ @@ -1072,9 +1103,8 @@ def listen_error_messages_raylet(worker, threads_stopped): error_message = error_data.error_message if (error_data.type == ray_constants.TASK_PUSH_ERROR): - # TODO(ekl) remove task push errors entirely now that we have - # the separate unhandled exception handler. - pass + # Delay it a bit to see if we can suppress it + task_error_queue.put((error_message, time.time())) else: logger.warning(error_message) except (OSError, redis.exceptions.ConnectionError) as e: @@ -1237,12 +1267,19 @@ def connect(node, # temporarily using this implementation which constantly queries the # scheduler for new error messages. 
if mode == SCRIPT_MODE: + q = queue.Queue() worker.listener_thread = threading.Thread( target=listen_error_messages_raylet, name="ray_listen_error_messages", - args=(worker, worker.threads_stopped)) + args=(worker, q, worker.threads_stopped)) + worker.printer_thread = threading.Thread( + target=print_error_messages_raylet, + name="ray_print_error_messages", + args=(q, worker.threads_stopped)) worker.listener_thread.daemon = True worker.listener_thread.start() + worker.printer_thread.daemon = True + worker.printer_thread.start() if log_to_driver: global_worker_stdstream_dispatcher.add_handler( "ray_print_logs", print_to_stdstream) @@ -1295,6 +1332,8 @@ def disconnect(exiting_interpreter=False): worker.import_thread.join_import_thread() if hasattr(worker, "listener_thread"): worker.listener_thread.join() + if hasattr(worker, "printer_thread"): + worker.printer_thread.join() if hasattr(worker, "logger_thread"): worker.logger_thread.join() worker.threads_stopped.clear() @@ -1406,11 +1445,13 @@ def get(object_refs, *, timeout=None): raise ValueError("'object_refs' must either be an object ref " "or a list of object refs.") + global last_task_error_raise_time # TODO(ujvl): Consider how to allow user to retrieve the ready objects. values, debugger_breakpoint = worker.get_objects( object_refs, timeout=timeout) for i, value in enumerate(values): if isinstance(value, RayError): + last_task_error_raise_time = time.time() if isinstance(value, ray.exceptions.ObjectLostError): worker.core_worker.dump_object_store_memory_usage() if isinstance(value, RayTaskError): diff --git a/src/ray/common/ray_object.h b/src/ray/common/ray_object.h index c036550a8652..633a5d787c7e 100644 --- a/src/ray/common/ray_object.h +++ b/src/ray/common/ray_object.h @@ -92,20 +92,12 @@ class RayObject { /// large to return directly as part of a gRPC response). bool IsInPlasmaError() const; - /// Mark this object as accessed before. 
- void SetAccessed() { accessed_ = true; }; - - /// Check if this object was accessed before. - bool WasAccessed() const { return accessed_; } - private: std::shared_ptr data_; std::shared_ptr metadata_; const std::vector nested_ids_; /// Whether this class holds a data copy. bool has_data_copy_; - /// Whether this object was accessed. - bool accessed_ = false; }; } // namespace ray diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 06d12387c8ad..0180e0a7ab84 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -422,7 +422,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ return Status::OK(); }, options_.ref_counting_enabled ? reference_counter_ : nullptr, local_raylet_client_, - options_.check_signals, options_.unhandled_exception_handler)); + options_.check_signals)); auto check_node_alive_fn = [this](const NodeID &node_id) { auto node = gcs_client_->Nodes().Get(node_id); diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 47023df7b40b..2ced7a10fdb8 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -82,7 +82,6 @@ struct CoreWorkerOptions { spill_objects(nullptr), restore_spilled_objects(nullptr), delete_spilled_objects(nullptr), - unhandled_exception_handler(nullptr), get_lang_stack(nullptr), kill_main(nullptr), ref_counting_enabled(false), @@ -147,8 +146,6 @@ struct CoreWorkerOptions { /// Application-language callback to delete objects from external storage. std::function &, rpc::WorkerType)> delete_spilled_objects; - /// Function to call on error objects never retrieved. - std::function unhandled_exception_handler; /// Language worker callback to get the current call stack. std::function get_lang_stack; // Function that tries to interrupt the currently running Python thread. 
diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.cc b/src/ray/core_worker/store_provider/memory_store/memory_store.cc index 7897b6504e82..6dad1b37be72 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.cc +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.cc @@ -93,7 +93,6 @@ void GetRequest::Set(const ObjectID &object_id, std::shared_ptr objec if (is_ready_) { return; // We have already hit the number of objects to return limit. } - object->SetAccessed(); objects_.emplace(object_id, object); if (objects_.size() == num_objects_ || (abort_if_any_object_is_exception_ && object->IsException() && @@ -107,7 +106,6 @@ std::shared_ptr GetRequest::Get(const ObjectID &object_id) const { std::unique_lock lock(mutex_); auto iter = objects_.find(object_id); if (iter != objects_.end()) { - iter->second->SetAccessed(); return iter->second; } @@ -118,13 +116,11 @@ CoreWorkerMemoryStore::CoreWorkerMemoryStore( std::function store_in_plasma, std::shared_ptr counter, std::shared_ptr raylet_client, - std::function check_signals, - std::function unhandled_exception_handler) + std::function check_signals) : store_in_plasma_(store_in_plasma), ref_counter_(counter), raylet_client_(raylet_client), - check_signals_(check_signals), - unhandled_exception_handler_(unhandled_exception_handler) {} + check_signals_(check_signals) {} void CoreWorkerMemoryStore::GetAsync( const ObjectID &object_id, std::function)> callback) { @@ -140,7 +136,6 @@ void CoreWorkerMemoryStore::GetAsync( } // It's important for performance to run the callback outside the lock. 
if (ptr != nullptr) { - ptr->SetAccessed(); callback(ptr); } } @@ -151,7 +146,6 @@ std::shared_ptr CoreWorkerMemoryStore::GetOrPromoteToPlasma( auto iter = objects_.find(object_id); if (iter != objects_.end()) { auto obj = iter->second; - obj->SetAccessed(); if (obj->IsInPlasmaError()) { return nullptr; } @@ -216,8 +210,6 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ if (should_add_entry) { // If there is no existing get request, then add the `RayObject` to map. objects_.emplace(object_id, object_entry); - } else { - OnErase(object_entry); } } @@ -231,7 +223,6 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ // It's important for performance to run the callbacks outside the lock. for (const auto &cb : async_callbacks) { - object_entry->SetAccessed(); cb(object_entry); } @@ -266,7 +257,6 @@ Status CoreWorkerMemoryStore::GetImpl(const std::vector &object_ids, const auto &object_id = object_ids[i]; auto iter = objects_.find(object_id); if (iter != objects_.end()) { - iter->second->SetAccessed(); (*results)[i] = iter->second; if (remove_after_get) { // Note that we cannot remove the object_id from `objects_` now, @@ -436,7 +426,6 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i if (it->second->IsInPlasmaError()) { plasma_ids_to_delete->insert(object_id); } else { - OnErase(it->second); objects_.erase(it); } } @@ -446,11 +435,7 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i void CoreWorkerMemoryStore::Delete(const std::vector &object_ids) { absl::MutexLock lock(&mu_); for (const auto &object_id : object_ids) { - auto it = objects_.find(object_id); - if (it != objects_.end()) { - OnErase(it->second); - objects_.erase(it); - } + objects_.erase(object_id); } } @@ -466,14 +451,6 @@ bool CoreWorkerMemoryStore::Contains(const ObjectID &object_id, bool *in_plasma) return false; } -void CoreWorkerMemoryStore::OnErase(std::shared_ptr obj) { - // TODO(ekl) 
note that this doesn't warn on errors that are stored in plasma. - if (obj->IsException() && !obj->IsInPlasmaError() && !obj->WasAccessed() && - unhandled_exception_handler_ != nullptr) { - unhandled_exception_handler_(*obj); - } -} - MemoryStoreStats CoreWorkerMemoryStore::GetMemoryStoreStatisticalData() { absl::MutexLock lock(&mu_); MemoryStoreStats item; diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.h b/src/ray/core_worker/store_provider/memory_store/memory_store.h index 0ca94ef6cc02..709227f65206 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.h +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.h @@ -35,8 +35,7 @@ class CoreWorkerMemoryStore { std::function store_in_plasma = nullptr, std::shared_ptr counter = nullptr, std::shared_ptr raylet_client = nullptr, - std::function check_signals = nullptr, - std::function unhandled_exception_handler = nullptr); + std::function check_signals = nullptr); ~CoreWorkerMemoryStore(){}; /// Put an object with specified ID into object store. @@ -144,9 +143,6 @@ class CoreWorkerMemoryStore { std::vector> *results, bool abort_if_any_object_is_exception); - /// Called when an object is erased from the store. - void OnErase(std::shared_ptr obj); - /// Optional callback for putting objects into the plasma store. std::function store_in_plasma_; @@ -177,9 +173,6 @@ class CoreWorkerMemoryStore { /// Function passed in to be called to check for signals (e.g., Ctrl-C). std::function check_signals_; - - /// Function called to report unhandled exceptions. - std::function unhandled_exception_handler_; }; } // namespace ray diff --git a/src/ray/core_worker/test/memory_store_test.cc b/src/ray/core_worker/test/memory_store_test.cc deleted file mode 100644 index f4403e4a887e..000000000000 --- a/src/ray/core_worker/test/memory_store_test.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2017 The Ray Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/core_worker/store_provider/memory_store/memory_store.h" - -#include "gtest/gtest.h" -#include "ray/common/test_util.h" - -namespace ray { - -TEST(TestMemoryStore, TestReportUnhandledErrors) { - std::vector> results; - WorkerContext context(WorkerType::WORKER, WorkerID::FromRandom(), JobID::FromInt(0)); - int unhandled_count = 0; - - std::shared_ptr provider = - std::make_shared( - nullptr, nullptr, nullptr, nullptr, - [&](const RayObject &obj) { unhandled_count++; }); - RayObject obj1(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); - RayObject obj2(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); - auto id1 = ObjectID::FromRandom(); - auto id2 = ObjectID::FromRandom(); - - // Check delete without get. - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj2, id2)); - ASSERT_EQ(unhandled_count, 0); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 2); - unhandled_count = 0; - - // Check delete after get. - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj1, id2)); - provider->Get({id1}, 1, 100, context, false, &results); - provider->GetOrPromoteToPlasma(id2); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 0); - - // Check delete after async get. 
- provider->GetAsync({id2}, [](std::shared_ptr obj) {}); - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj2, id2)); - provider->GetAsync({id1}, [](std::shared_ptr obj) {}); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 0); -} - -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} From f7297fdadcfe5250029c7c59b421a90fb978c93a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 019/244] Revert "[operator] expose RAY_CONFIG_DIR env var (fix #14074) (#14076)" This reverts commit 4f856778af2e4f30fbd8d764fd205095895988f4. --- python/ray/ray_operator/operator_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/ray_operator/operator_utils.py b/python/ray/ray_operator/operator_utils.py index 98a31ce6f9b7..3dc50e9a1529 100644 --- a/python/ray/ray_operator/operator_utils.py +++ b/python/ray/ray_operator/operator_utils.py @@ -10,9 +10,7 @@ RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE") -RAY_CONFIG_DIR = os.environ.get("RAY_CONFIG_DIR") or \ - os.path.expanduser("~/ray_cluster_configs") - +RAY_CONFIG_DIR = os.path.expanduser("~/ray_cluster_configs") CONFIG_SUFFIX = "_config.yaml" CONFIG_FIELDS = { From de75b49b26d9c072a183473c76325b7836cbd136 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 020/244] Revert "skip test_basic_reconstruction_put on win (#14082)" This reverts commit 88f3f8beb5a7bca589a95b2f68b3d5be8e74c20b. 
--- python/ray/tests/test_reconstruction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index 1589f77d8332..35d00a9b819d 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -163,7 +163,6 @@ def dependent_task(x): raise e.as_instanceof_cause() -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled): config = { From 516c8b7e9eac0f449c5b0ebce1f4cec61b9ad695 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 021/244] Revert "[OBOD] Disable the ownership-based object directory for all tests that use ray.objects(). (#14065)" This reverts commit 9b9376eef47e87be457cd91ea8e4197a24be01aa. --- python/ray/tests/test_advanced_3.py | 23 +++++++++++-------- python/ray/tests/test_client_references.py | 13 +++-------- python/ray/tests/test_multi_node.py | 10 -------- .../tune/tests/test_trial_scheduler_pbt.py | 17 ++------------ 4 files changed, 18 insertions(+), 45 deletions(-) diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py index f9c736689e61..2e60f40e997c 100644 --- a/python/ray/tests/test_advanced_3.py +++ b/python/ray/tests/test_advanced_3.py @@ -21,8 +21,9 @@ import setproctitle import subprocess -from ray.test_utils import (check_call_ray, wait_for_condition, - wait_for_num_actors, new_scheduler_enabled) +from ray.test_utils import (check_call_ray, RayTestTimeoutException, + wait_for_condition, wait_for_num_actors, + new_scheduler_enabled) logger = logging.getLogger(__name__) @@ -155,6 +156,15 @@ def f(x): assert ray.get(f.remote(non_local.remote())) == non_local_node.unique_id +def wait_for_num_objects(num_objects, timeout=10): + start_time = 
time.time() + while time.time() - start_time < timeout: + if len(ray.objects()) >= num_objects: + return + time.sleep(0.1) + raise RayTestTimeoutException("Timed out while waiting for global state.") + + def test_global_state_api(shutdown_only): ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1}) @@ -614,14 +624,7 @@ def f(self): def test_lease_request_leak(shutdown_only): - ray.init( - num_cpus=1, - _system_config={ - # This test uses ray.objects(), which only works with the GCS-based - # object directory - "ownership_based_object_directory_enabled": False, - "object_timeout_milliseconds": 200 - }) + ray.init(num_cpus=1, _system_config={"object_timeout_milliseconds": 200}) assert len(ray.objects()) == 0 @ray.remote diff --git a/python/ray/tests/test_client_references.py b/python/ray/tests/test_client_references.py index b0dd01b0498a..54bfa7f4290c 100644 --- a/python/ray/tests/test_client_references.py +++ b/python/ray/tests/test_client_references.py @@ -33,17 +33,10 @@ def test_cond(): @pytest.mark.parametrize( - "ray_start_cluster", - [{ + "ray_start_cluster", [{ "num_nodes": 1, - "do_init": False, - # This test uses ray.objects(), which only works with the GCS-based - # object directory - "_system_config": { - "ownership_based_object_directory_enabled": False - }, - }], - indirect=True) + "do_init": False + }], indirect=True) def test_delete_refs_on_disconnect(ray_start_cluster): cluster = ray_start_cluster with ray_start_cluster_client_server_pair(cluster.address) as pair: diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index 464d985eafe2..ae9ae1c1e981 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -178,16 +178,6 @@ def f(): assert "success" in out -@pytest.mark.parametrize( - "call_ray_start", - [ - "ray start --head --num-cpus=1 --min-worker-port=0 " - "--max-worker-port=0 --port 0 --system-config=" - # This test uses ray.objects(), which only works with the 
GCS-based - # object directory - "{\"ownership_based_object_directory_enabled\":false}", - ], - indirect=True) def test_cleanup_on_driver_exit(call_ray_start): # This test will create a driver that creates a bunch of objects and then # exits. The entries in the object table should be cleaned up. diff --git a/python/ray/tune/tests/test_trial_scheduler_pbt.py b/python/ray/tune/tests/test_trial_scheduler_pbt.py index 48ba7322958b..300ea0bfbc25 100644 --- a/python/ray/tune/tests/test_trial_scheduler_pbt.py +++ b/python/ray/tune/tests/test_trial_scheduler_pbt.py @@ -29,14 +29,7 @@ def __call__(self, *args, **kwargs): class PopulationBasedTrainingMemoryTest(unittest.TestCase): def setUp(self): - ray.init( - num_cpus=1, - object_store_memory=100 * MB, - _system_config={ - # This test uses ray.objects(), which only works with the - # GCS-based object directory - "ownership_based_object_directory_enabled": False, - }) + ray.init(num_cpus=1, object_store_memory=100 * MB) def tearDown(self): ray.shutdown() @@ -97,13 +90,7 @@ def save(self, *args, **kwargs): class PopulationBasedTrainingFileDescriptorTest(unittest.TestCase): def setUp(self): - ray.init( - num_cpus=2, - _system_config={ - # This test uses ray.objects(), which only works with the - # GCS-based object directory - "ownership_based_object_directory_enabled": False, - }) + ray.init(num_cpus=2) os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0" def tearDown(self): From 7a93143acce235ac94a1874990e2fa546ba5c394 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 022/244] Revert "[OBOD] Add support for ownership-based object directory object recovery. (#14066)" This reverts commit 7fb5bab572790a027d1fd3adbe948e0702709301. 
--- src/ray/core_worker/core_worker.cc | 69 +++++++++--------------------- 1 file changed, 20 insertions(+), 49 deletions(-) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 0180e0a7ab84..86f6344b53dc 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -535,56 +535,27 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ actor_manager_ = std::unique_ptr( new ActorManager(gcs_client_, direct_actor_submitter_, reference_counter_)); - std::function - object_lookup_fn; - - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - object_lookup_fn = [this, node_addr_factory](const ObjectID &object_id, - const ObjectLookupCallback &callback) { - std::vector locations; - const absl::optional> object_locations = - reference_counter_->GetObjectLocations(object_id); - if (object_locations.has_value()) { - locations.reserve(object_locations.value().size()); - for (const auto &node_id : object_locations.value()) { - absl::optional addr = node_addr_factory(node_id); - if (addr.has_value()) { - locations.push_back(addr.value()); - } else { - // We're getting potentially stale locations directly from the reference - // counter, so the location might be a dead node. 
- RAY_LOG(DEBUG) << "Location " << node_id - << " is dead, not using it in the recovery of object " - << object_id; + auto object_lookup_fn = [this](const ObjectID &object_id, + const ObjectLookupCallback &callback) { + return gcs_client_->Objects().AsyncGetLocations( + object_id, [this, object_id, callback]( + const Status &status, + const boost::optional &result) { + RAY_CHECK_OK(status); + std::vector locations; + for (const auto &loc : result->locations()) { + const auto &node_id = NodeID::FromBinary(loc.manager()); + auto node = gcs_client_->Nodes().Get(node_id); + RAY_CHECK(node.has_value()); + rpc::Address address; + address.set_raylet_id(node->node_id()); + address.set_ip_address(node->node_manager_address()); + address.set_port(node->node_manager_port()); + locations.push_back(address); } - } - } - callback(object_id, locations); - return Status::OK(); - }; - } else { - object_lookup_fn = [this](const ObjectID &object_id, - const ObjectLookupCallback &callback) { - return gcs_client_->Objects().AsyncGetLocations( - object_id, [this, object_id, callback]( - const Status &status, - const boost::optional &result) { - RAY_CHECK_OK(status); - std::vector locations; - for (const auto &loc : result->locations()) { - const auto &node_id = NodeID::FromBinary(loc.manager()); - auto node = gcs_client_->Nodes().Get(node_id); - RAY_CHECK(node.has_value()); - rpc::Address address; - address.set_raylet_id(node->node_id()); - address.set_ip_address(node->node_manager_address()); - address.set_port(node->node_manager_port()); - locations.push_back(address); - } - callback(object_id, locations); - }); - }; - } + callback(object_id, locations); + }); + }; object_recovery_manager_ = std::unique_ptr(new ObjectRecoveryManager( rpc_address_, raylet_client_factory, local_raylet_client_, object_lookup_fn, From 5c7751348fef01d054a1f9abc7774dbda98bef2f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: 
[PATCH 023/244] Revert "[autoscaler][kubernetes][docs] Updated Kubernetes Documentation (#14016)" This reverts commit c34d1ef8bb602088bf92772184e4ba80ba62fe87. --- doc/kubernetes/example.py | 55 ++ doc/kubernetes/ray-cluster.yaml | 60 +- doc/kubernetes/ray-job.yaml | 32 + doc/kubernetes/ray-namespace.yaml | 4 + doc/source/cluster/deploy.rst | 4 + doc/source/cluster/k8s-operator.rst | 241 ++++++++ doc/source/cluster/kubernetes-gpu.rst | 91 --- doc/source/cluster/kubernetes-manual.rst | 162 ----- doc/source/cluster/kubernetes.rst | 734 ++++++++--------------- doc/source/package-ref.rst | 1 - doc/source/ray-dashboard.rst | 2 - 11 files changed, 635 insertions(+), 751 deletions(-) create mode 100644 doc/kubernetes/example.py create mode 100644 doc/kubernetes/ray-job.yaml create mode 100644 doc/kubernetes/ray-namespace.yaml create mode 100644 doc/source/cluster/k8s-operator.rst delete mode 100644 doc/source/cluster/kubernetes-gpu.rst delete mode 100644 doc/source/cluster/kubernetes-manual.rst diff --git a/doc/kubernetes/example.py b/doc/kubernetes/example.py new file mode 100644 index 000000000000..b1ea3e23d901 --- /dev/null +++ b/doc/kubernetes/example.py @@ -0,0 +1,55 @@ +from collections import Counter +import os +import sys +import time +import ray + + +@ray.remote +def gethostname(x): + import platform + import time + time.sleep(0.01) + return x + (platform.node(), ) + + +def wait_for_nodes(expected): + # Wait for all nodes to join the cluster. + while True: + num_nodes = len(ray.nodes()) + if num_nodes < expected: + print("{} nodes have joined so far, waiting for {} more.".format( + num_nodes, expected - num_nodes)) + sys.stdout.flush() + time.sleep(1) + else: + break + + +def main(): + wait_for_nodes(4) + + # Check that objects can be transferred from each node to each other node. 
+ for i in range(10): + print("Iteration {}".format(i)) + results = [ + gethostname.remote(gethostname.remote(())) for _ in range(100) + ] + print(Counter(ray.get(results))) + sys.stdout.flush() + + print("Success!") + sys.stdout.flush() + + +if __name__ == "__main__": + # NOTE: If you know you're running this on the head node, you can just + # use "localhost" here. + # redis_host = "localhost" + if ("RAY_HEAD_SERVICE_HOST" not in os.environ + or os.environ["RAY_HEAD_SERVICE_HOST"] == ""): + raise ValueError("RAY_HEAD_SERVICE_HOST environment variable empty." + "Is there a ray cluster running?") + redis_host = os.environ["RAY_HEAD_SERVICE_HOST"] + ray.init(address=redis_host + ":6379") + main() diff --git a/doc/kubernetes/ray-cluster.yaml b/doc/kubernetes/ray-cluster.yaml index fe3a04c486e7..70d386ad5b21 100644 --- a/doc/kubernetes/ray-cluster.yaml +++ b/doc/kubernetes/ray-cluster.yaml @@ -6,18 +6,24 @@ metadata: name: ray-head spec: ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - - name: redis - protocol: TCP - port: 6379 - targetPort: 6379 + # Redis ports. + - name: redis-primary + port: 6379 + targetPort: 6379 + - name: redis-shard-0 + port: 6380 + targetPort: 6380 + - name: redis-shard-1 + port: 6381 + targetPort: 6381 + + # Ray internal communication ports. 
+ - name: object-manager + port: 12345 + targetPort: 12345 + - name: node-manager + port: 12346 + targetPort: 12346 selector: component: ray-head --- @@ -56,12 +62,14 @@ spec: image: rayproject/ray:nightly imagePullPolicy: IfNotPresent command: [ "/bin/bash", "-c", "--" ] - args: - - "ray start --head --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --block" + args: + - "ray start --head --node-ip-address=$MY_POD_IP --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --block" ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to @@ -70,6 +78,11 @@ spec: - mountPath: /dev/shm name: dshm env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + # This is used in the ray start command so that Ray can spawn the # correct number of processes. Omitting this may lead to degraded # performance. @@ -111,14 +124,19 @@ spec: imagePullPolicy: IfNotPresent command: ["/bin/bash", "-c", "--"] args: - - "ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS --object-manager-port=12345 --node-manager-port=12346 --block" - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. 
+ - "ray start --node-ip-address=$MY_POD_IP --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY --object-manager-port=12345 --node-manager-port=12346 --block" + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. volumeMounts: - mountPath: /dev/shm name: dshm env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + # This is used in the ray start command so that Ray can spawn the # correct number of processes. Omitting this may lead to degraded # performance. diff --git a/doc/kubernetes/ray-job.yaml b/doc/kubernetes/ray-job.yaml new file mode 100644 index 000000000000..686359e167d8 --- /dev/null +++ b/doc/kubernetes/ray-job.yaml @@ -0,0 +1,32 @@ +# Job to run a Ray program in its own pod. Assumes that a cluster is already +# running (e.g., from './ray-cluster.yaml'). +apiVersion: batch/v1 +kind: Job +metadata: + namespace: ray + generateName: ray-test-job- +spec: + template: + spec: + restartPolicy: Never + containers: + - name: ray-head + image: rayproject/ray:nightly + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c", "--" ] + args: + - "cd ~ && wget https://raw.githubusercontent.com/ray-project/ray/master/doc/kubernetes/example.py && + ray start --node-ip-address=$MY_POD_IP --num-cpus=0 --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY --object-manager-port=12345 --node-manager-port=12346 && + python example.py" + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. 
+ env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + resources: + requests: + cpu: 100m + memory: 512Mi diff --git a/doc/kubernetes/ray-namespace.yaml b/doc/kubernetes/ray-namespace.yaml new file mode 100644 index 000000000000..3f379c3759b0 --- /dev/null +++ b/doc/kubernetes/ray-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: ray diff --git a/doc/source/cluster/deploy.rst b/doc/source/cluster/deploy.rst index e9253614f496..60a45e171062 100644 --- a/doc/source/cluster/deploy.rst +++ b/doc/source/cluster/deploy.rst @@ -3,6 +3,10 @@ Ray with Cluster Managers ========================= +.. note:: + + If you're using AWS, Azure or GCP you can use the :ref:`Ray Cluster Launcher ` to simplify the cluster setup process. + .. toctree:: :maxdepth: 2 diff --git a/doc/source/cluster/k8s-operator.rst b/doc/source/cluster/k8s-operator.rst new file mode 100644 index 000000000000..d846fe029177 --- /dev/null +++ b/doc/source/cluster/k8s-operator.rst @@ -0,0 +1,241 @@ +.. _k8s-operator: + +The Ray Kubernetes Operator +================================= + +Ray provides a `Kubernetes Operator`_ for managing autoscaling Ray clusters. +Using the operator provides similar functionality to deploying a Ray cluster using +the :ref:`Ray Cluster Launcher`. However, working with the operator does not require +running Ray locally -- all interactions with your Ray cluster are mediated by Kubernetes. + +The operator makes use of a `Kubernetes Custom Resource`_ called a *RayCluster*. +A RayCluster is specified by a configuration similar to the ``yaml`` files used by the Ray Cluster Launcher. +Internally, the operator uses Ray's autoscaler to manage your Ray cluster. However, the autoscaler runs in a +separate operator pod, rather than on the Ray head node. Applying multiple RayCluster custom resources in the operator's +namespace allows the operator to manage several Ray clusters. 
+ +The rest of this document explains step-by-step how to use the Ray Kubernetes Operator to launch a Ray cluster on your existing Kubernetes cluster. + +.. role:: bash(code) + :language: bash + +.. note:: + The Ray Kubernetes Operator is still experimental. For the yaml files in the examples below, we recommend using the latest master version of Ray. + +.. warning:: + The Ray Kubernetes Operator requires Kubernetes version at least ``v1.17.0``. Check Kubernetes version info with the command + :bash:`kubectl version`. + +.. note:: + The example commands in this document launch six Kubernetes pods, using a total of 6 CPU and 3.5Gi memory. + If you are experimenting using a test Kubernetes environment such as `minikube`_, make sure to provision sufficient resources, e.g. + :bash:`minikube start --cpus=6 --memory=\"4G\"`. + Alternatively, reduce resource usage by editing the ``yaml`` files referenced in this document; for example, reduce ``minWorkers`` + in ``example_cluster.yaml`` and ``example_cluster2.yaml``. + + +Applying the RayCluster Custom Resource Definition +-------------------------------------------------- +First, we need to apply the `Kubernetes Custom Resource Definition`_ (CRD) defining a RayCluster. + +.. note:: + + Creating a Custom Resource Definition requires the appropriate Kubernetes cluster-level privileges. + +.. code-block:: shell + + $ kubectl apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml + + customresourcedefinition.apiextensions.k8s.io/rayclusters.cluster.ray.io created + +Picking a Kubernetes Namespace +------------------------------- +The rest of the Kubernetes resources we will use are `namespaced`_. +You can use an existing namespace for your Ray clusters or create a new one if you have permissions. +For this example, we will create a namespace called ``ray``. + +..
code-block:: shell + + $ kubectl create namespace ray + + namespace/ray created + +Starting the Operator +---------------------- + +To launch the operator in our namespace, we execute the following command. + +.. code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml + + serviceaccount/ray-operator-serviceaccount created + role.rbac.authorization.k8s.io/ray-operator-role created + rolebinding.rbac.authorization.k8s.io/ray-operator-rolebinding created + pod/ray-operator-pod created + +The output shows that we've launched a Pod named ``ray-operator-pod``. This is the pod that runs the operator process. +The ServiceAccount, Role, and RoleBinding we have created grant the operator pod the `permissions`_ it needs to manage Ray clusters. + +Launching Ray Clusters +---------------------- +Finally, to launch a Ray cluster, we create a RayCluster custom resource. + +.. code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml + + raycluster.cluster.ray.io/example-cluster created + +The operator detects the RayCluster resource we've created and launches an autoscaling Ray cluster. +Our RayCluster configuration specifies ``minWorkers:2`` in the second entry of ``spec.podTypes``, so we get a head node and two workers upon launch. + +.. note:: + + For more details about RayCluster resources, we recommend taking a look at the annotated example ``example_cluster.yaml`` applied in the last command. + +.. code-block:: shell + + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + example-cluster-ray-head-hbxvv 1/1 Running 0 72s + example-cluster-ray-worker-4hvv6 1/1 Running 0 64s + example-cluster-ray-worker-78kp5 1/1 Running 0 64s + ray-operator-pod 1/1 Running 0 2m33s + +We see four pods: the operator, the Ray head node, and two Ray worker nodes. + +Let's launch another cluster in the same namespace, this one specifying ``minWorkers:1``. + +..
code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + +We confirm that both clusters are running in our namespace. + +.. code-block:: shell + + $ kubectl -n ray get rayclusters + NAME AGE + example-cluster 12m + example-cluster2 114s + + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + example-cluster-ray-head-th4wv 1/1 Running 0 10m + example-cluster-ray-worker-q9pjn 1/1 Running 0 10m + example-cluster-ray-worker-qltnp 1/1 Running 0 10m + example-cluster2-ray-head-kj5mg 1/1 Running 0 10s + example-cluster2-ray-worker-qsgnd 1/1 Running 0 1s + ray-operator-pod 1/1 Running 0 10m + +Now we can :ref:`run Ray programs` on our Ray clusters. + +Monitoring +---------- +Autoscaling logs are written to the operator pod's ``stdout`` and can be accessed with :code:`kubectl logs`. +Each line of output is prefixed by the name of the cluster followed by a colon. +The following command gets the last hundred lines of autoscaling logs for our second cluster. + +.. code-block:: shell + + $ kubectl -n ray logs ray-operator-pod | grep ^example-cluster2: | tail -n 100 + +The output should include monitoring updates that look like this: + +.. 
code-block:: shell + + example-cluster2:2020-12-12 13:55:36,814 DEBUG autoscaler.py:693 -- Cluster status: 1 nodes + example-cluster2: - MostDelayedHeartbeats: {'172.17.0.4': 0.04093289375305176, '172.17.0.5': 0.04084634780883789} + example-cluster2: - NodeIdleSeconds: Min=36 Mean=38 Max=41 + example-cluster2: - ResourceUsage: 0.0/2.0 CPU, 0.0/1.0 Custom1, 0.0/1.0 is_spot, 0.0 GiB/0.58 GiB memory, 0.0 GiB/0.1 GiB object_store_memory + example-cluster2: - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 + example-cluster2:Worker node types: + example-cluster2: - worker-nodes: 1 + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:148 -- Cluster resources: [{'object_store_memory': 1.0, 'node:172.17.0.4': 1.0, 'memory': 5.0, 'CPU': 1.0}, {'object_store_memory': 1.0, 'is_spot': 1.0, 'memory': 6.0, 'node:172.17.0.5': 1.0, 'Custom1': 1.0, 'CPU': 1.0}] + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:149 -- Node counts: defaultdict(, {'head-node': 1, 'worker-nodes + ': 1}) + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:159 -- Placement group demands: [] + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:186 -- Resource demands: [] + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:187 -- Unfulfilled demands: [] + example-cluster2:2020-12-12 13:55:36,891 INFO resource_demand_scheduler.py:209 -- Node requests: {} + example-cluster2:2020-12-12 13:55:36,903 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). + example-cluster2:2020-12-12 13:55:36,923 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). + + +Updating and Retrying +--------------------- +To update a Ray cluster's configuration, edit the ``yaml`` file of the corresponding RayCluster resource +and apply it again: + +.. 
code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml + +To force a restart with the same configuration, you can add an `annotation`_ to the RayCluster resource's ``metadata.annotations`` field, e.g. + +.. code-block:: yaml + + apiVersion: cluster.ray.io/v1 + kind: RayCluster + metadata: + name: example-cluster + annotations: + try: again + spec: + ... + +Then reapply the RayCluster, as above. + +Currently, editing and reapplying a RayCluster resource will stop and restart Ray processes running on the corresponding +Ray cluster. Similarly, deleting and relaunching the operator pod will stop and restart Ray processes on all Ray clusters in the operator's namespace. +This behavior may be modified in future releases. + + +Cleaning Up +----------- +We shut down a Ray cluster by deleting the associated RayCluster resource. +Either of the next two commands will delete our second cluster ``example-cluster2``. + +.. code-block:: shell + + $ kubectl -n ray delete raycluster example-cluster2 + # OR + $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + +The pods associated with ``example-cluster2`` go into ``TERMINATING`` status. In a few moments, we check that these pods are gone: + +.. code-block:: shell + + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + example-cluster-ray-head-th4wv 1/1 Running 0 57m + example-cluster-ray-worker-q9pjn 1/1 Running 0 56m + example-cluster-ray-worker-qltnp 1/1 Running 0 56m + ray-operator-pod 1/1 Running 0 57m + +Only the operator pod and the first ``example-cluster`` remain. + +To finish clean-up, we delete the cluster ``example-cluster`` and then the operator's resources. + +..
code-block:: shell + + $ kubectl -n ray delete raycluster example-cluster + $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml + +If you like, you can delete the RayCluster custom resource definition. +(Using the operator again will then require reapplying the CRD.) + +.. code-block:: shell + + $ kubectl delete crd rayclusters.cluster.ray.io + # OR + $ kubectl delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml + +.. _`Kubernetes Operator`: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/ +.. _`Kubernetes Custom Resource`: https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/ +.. _`Kubernetes Custom Resource Definition`: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/ +.. _`annotation`: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/#attaching-metadata-to-objects +.. _`permissions`: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ +.. _`minikube`: https://minikube.sigs.k8s.io/docs/start/ +.. _`namespaced`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ diff --git a/doc/source/cluster/kubernetes-gpu.rst b/doc/source/cluster/kubernetes-gpu.rst deleted file mode 100644 index c91382bf6e7a..000000000000 --- a/doc/source/cluster/kubernetes-gpu.rst +++ /dev/null @@ -1,91 +0,0 @@ -:orphan: - -.. _k8s-gpus: - -GPU Usage with Kubernetes -========================= -This document provides some notes on GPU usage with Kubernetes. - -To use GPUs on Kubernetes, you will need to configure both your Kubernetes setup and add additional values to your Ray cluster configuration. - -For relevant documentation for GPU usage on different clouds, see instructions for `GKE`_, for `EKS`_, and for `AKS`_. - -The `Ray Docker Hub `_ hosts CUDA-based images packaged with Ray for use in Kubernetes pods.
-For example, the image ``rayproject/ray-ml:nightly-gpu`` is ideal for running GPU-based ML workloads with the most recent nightly build of Ray. -Read :ref:`here` for further details on Ray images. - -Using Nvidia GPUs requires specifying the relevant resource `limits` in the container fields of your Kubernetes configurations. -(Kubernetes `sets `_ -the GPU request equal to the limit.) The configuration for a pod running a Ray GPU image and -using one Nvidia GPU looks like this: - -.. code-block:: yaml - - apiVersion: v1 - kind: Pod - metadata: - generateName: example-cluster-ray-worker - spec: - ... - containers: - - name: ray-node - image: rayproject/ray:nightly-gpu - ... - resources: - cpu: 1000m - memory: 512Mi - limits: - memory: 512Mi - nvidia.com/gpu: 1 - -GPU taints and tolerations --------------------------- -.. note:: - - Users using a managed Kubernetes service probably don't need to worry about this section. - -The `Nvidia gpu plugin`_ for Kubernetes applies `taints`_ to GPU nodes; these taints prevent non-GPU pods from being scheduled on GPU nodes. -Managed Kubernetes services like GKE, EKS, and AKS automatically apply matching `tolerations`_ -to pods requesting GPU resources. Tolerations are applied by means of Kubernetes's `ExtendedResourceToleration`_ `admission controller`_. -If this admission controller is not enabled for your Kubernetes cluster, you may need to manually add a GPU toleration each of to your GPU pod configurations. For example, - -.. code-block:: yaml - - apiVersion: v1 - kind: Pod - metadata: - generateName: example-cluster-ray-worker - spec: - ... - tolerations: - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - ... - containers: - - name: ray-node - image: rayproject/ray:nightly-gpu - ... - -Further reference and discussion --------------------------------- -Read about Kubernetes device plugins `here `__, -about Kubernetes GPU plugins `here `__, -and about Nvidia's GPU plugin for Kubernetes `here `__. 
- -If you run into problems setting up GPUs for your Ray cluster on Kubernetes, please reach out to us at ``_. - -Questions or Issues? --------------------- - -.. include:: /_help.rst - -.. _`GKE`: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus -.. _`EKS`: https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html -.. _`AKS`: https://docs.microsoft.com/en-us/azure/aks/gpu-cluster - -.. _`tolerations`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ -.. _`taints`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ -.. _`Nvidia gpu plugin`: https://github.com/NVIDIA/k8s-device-plugin -.. _`admission controller`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/ -.. _`ExtendedResourceToleration`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/#extendedresourcetoleration diff --git a/doc/source/cluster/kubernetes-manual.rst b/doc/source/cluster/kubernetes-manual.rst deleted file mode 100644 index 5cd6e10ffc0a..000000000000 --- a/doc/source/cluster/kubernetes-manual.rst +++ /dev/null @@ -1,162 +0,0 @@ -:orphan: - -.. _ray-k8s-static: - -Deploying a Static Cluster -========================== - -This document gives an example of how to manually deploy a non-autoscaling Ray cluster on Kubernetes. - -To learn about deploying an autoscaling Ray cluster using :ref:`Ray's Kubernetes operator`, read -:ref:`here`. - -To learn about deploying an autoscaling Ray cluster using the :ref:`Ray Cluster Launcher`, read -:ref:`here`. - - -Creating a Ray Namespace ------------------------- - -First, create a `Kubernetes Namespace`_ for Ray resources on your cluster. The -following commands will create resources under this Namespace, so if you want -to use a different one than ``ray``, please be sure to also change the -``namespace`` fields in the provided ``yaml`` files and anytime you see a ``-n`` -flag passed to ``kubectl``. - -.. 
code-block:: shell - - $ kubectl create namespace ray - -Starting a Ray Cluster ----------------------- - - -A Ray cluster consists of a single head node and a set of worker nodes (the -provided ``ray-cluster.yaml`` file will start 3 worker nodes). In the example -Kubernetes configuration, this is implemented as: - -- A ``ray-head`` `Kubernetes Service`_ that enables the worker nodes to discover the location of the head node on start up. - This Service also enables access to the Ray Client and Ray Dashboard. -- A ``ray-head`` `Kubernetes Deployment`_ that backs the ``ray-head`` Service with a single head node pod (replica). -- A ``ray-worker`` `Kubernetes Deployment`_ with multiple worker node pods (replicas) that connect to the ``ray-head`` pod using the ``ray-head`` Service. - -Note that because the head and worker nodes are Deployments, Kubernetes will -automatically restart pods that crash to maintain the correct number of -replicas. - -- If a worker node goes down, a replacement pod will be started and joined to the cluster. -- If the head node goes down, it will be restarted. This will start a new Ray cluster. Worker nodes that were connected to the old head node will crash and be restarted, connecting to the new head node when they come back up. - -Try deploying a cluster with the provided Kubernetes config by running the -following command: - -.. code-block:: shell - - $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml - -Verify that the pods are running by running ``kubectl get pods -n ray``. You -may have to wait up to a few minutes for the pods to enter the 'Running' -state on the first run. - -.. code-block:: shell - - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-6bxvz 1/1 Running 0 10s - ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 5s - ray-worker-5c49b7cc57-d9m86 1/1 Running 0 5s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 5s - -.. note:: - - You might see a nonzero number of RESTARTS for the worker pods. 
That can - happen when the worker pods start up before the head pod and the workers - aren't able to connect. This shouldn't affect the behavior of the cluster. - -To change the number of worker nodes in the cluster, change the ``replicas`` -field in the worker deployment configuration in that file and then re-apply -the config as follows: - -.. code-block:: shell - - # Edit 'ray/doc/kubernetes/ray-cluster.yaml' and change the 'replicas' - # field under the ray-worker deployment to, e.g., 4. - - # Re-apply the new configuration to the running deployment. - $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml - service/ray-head unchanged - deployment.apps/ray-head unchanged - deployment.apps/ray-worker configured - - # Verify that there are now the correct number of worker pods running. - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-6bxvz 1/1 Running 0 30s - ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 25s - ray-worker-5c49b7cc57-d9m86 1/1 Running 0 25s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 25s - ray-worker-5c49b7cc57-zzfg2 1/1 Running 0 0s - -To validate that the restart behavior is working properly, try killing pods -and checking that they are restarted by Kubernetes: - -.. code-block:: shell - - # Delete a worker pod. - $ kubectl -n ray delete pod ray-worker-5c49b7cc57-c6xs8 - pod "ray-worker-5c49b7cc57-c6xs8" deleted - - # Check that a new worker pod was started (this may take a few seconds). - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-6bxvz 1/1 Running 0 45s - ray-worker-5c49b7cc57-d9m86 1/1 Running 0 40s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 40s - ray-worker-5c49b7cc57-ypq8x 1/1 Running 0 0s - - # Delete the head pod. - $ kubectl -n ray delete pod ray-head-5455bb66c9-6bxvz - pod "ray-head-5455bb66c9-6bxvz" deleted - - # Check that a new head pod was started and the worker pods were restarted. 
- $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-gqzql 1/1 Running 0 0s - ray-worker-5c49b7cc57-d9m86 1/1 Running 1 50s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 1 50s - ray-worker-5c49b7cc57-ypq8x 1/1 Running 1 10s - - # You can even try deleting all of the pods in the Ray namespace and checking - # that Kubernetes brings the right number back up. - $ kubectl -n ray delete pods --all - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-7l6xj 1/1 Running 0 10s - ray-worker-5c49b7cc57-57tpv 1/1 Running 0 10s - ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 10s - ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 10s - -Now that we have a running cluster, :ref:`we can execute Ray programs `. - -Cleaning Up ------------ - -To delete a running Ray cluster, you can run the following command: - -.. code-block:: shell - - kubectl delete -f ray/doc/kubernetes/ray-cluster.yaml - - -Questions or Issues? --------------------- - -.. include:: /_help.rst - - -.. _`Kubernetes Namespace`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ -.. _`Kubernetes Service`: https://kubernetes.io/docs/concepts/services-networking/service/ -.. _`Kubernetes Deployment`: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/ -.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/ - -.. _`Discussion Board`: https://discuss.ray.io/ diff --git a/doc/source/cluster/kubernetes.rst b/doc/source/cluster/kubernetes.rst index 94711b59507e..36a9dc126c62 100644 --- a/doc/source/cluster/kubernetes.rst +++ b/doc/source/cluster/kubernetes.rst @@ -1,430 +1,254 @@ -*********************** -Deploying on Kubernetes -*********************** - .. _ray-k8s-deploy: -Introduction -============ -You can leverage your Kubernetes cluster as a substrate for execution of distributed Ray programs. 
-The Ray Autoscaler spins up and deletes Kubernetes pods according to resource demands of the Ray workload - each Ray node runs in its own Kubernetes pod. - -Quick Guide ------------ - -This document covers the following topics: - -- :ref:`Overview of methods for launching a Ray Cluster on Kubernetes` -- :ref:`Managing clusters with the Ray Cluster Launcher` -- :ref:`Managing clusters with the Ray Kubernetes Operator` -- :ref:`Interacting with a Ray Cluster via a Kubernetes Service` -- :ref:`Comparison of the Ray Cluster Launcher and Ray Kubernetes Operator` - -You can find more information at the following links: - -- :ref:`GPU usage with Kubernetes` -- :ref:`Using Ray Tune on your Kubernetes cluster` -- :ref:`How to manually set up a non-autoscaling Ray cluster on Kubernetes` - -.. _k8s-overview: - -Ray on Kubernetes -================= - -Ray supports two ways of launching an autoscaling Ray cluster on Kubernetes. - -- Using the :ref:`Ray Cluster Launcher ` -- Using the :ref:`Ray Kubernetes Operator ` - -The Cluster Launcher and Ray Kubernetes Operator provide similar functionality; each serves as an `interface to the Ray autoscaler`. -Below is a brief overview of the two tools. - -The Ray Cluster Launcher ------------------------- -The :ref:`Ray Cluster Launcher ` is geared towards experimentation and development and can be used to launch Ray clusters on Kubernetes (among other backends). -It allows you to manage an autoscaling Ray Cluster from your local environment using the :ref:`Ray CLI `. -For example, you can use ``ray up`` to launch a Ray cluster on Kubernetes and ``ray exec`` to execute commands in the Ray head node's pod. -Note that using the Cluster Launcher requires Ray to be :ref:`installed locally `. - -* Get started with the :ref:`Ray Cluster Launcher on Kubernetes`. - -The Ray Kubernetes Operator ---------------------------- -The Ray Kubernetes Operator is a Kubernetes-native solution geared towards production use cases. 
-Rather than handling cluster launching locally, cluster launching and autoscaling are centralized in the Operator's Pod. -The Operator follows the standard Kubernetes `pattern `__ - it runs -a control loop which manages a `Kubernetes Custom Resource`_ specifying the desired state of your Ray cluster. -Using the Kubernetes Operator does not require a local installation of Ray - all interactions with your Ray cluster are mediated by Kubernetes. - -* Get started with the :ref:`Ray Kubernetes Operator`. - - -Further reading ---------------- - -Read :ref:`here` for more details on the comparison between the Operator and Cluster Launcher. -Note that it is also possible to manually deploy a :ref:`non-autoscaling Ray cluster ` on Kubernetes. - -.. note:: - - The configuration ``yaml`` files used in this document are provided in the `Ray repository`_ - as examples to get you started. When deploying real applications, you will probably - want to build and use your own container images, add more worker nodes to the - cluster, and change the resource requests for the head and worker nodes. Refer to the provided ``yaml`` - files to be sure that you maintain important configuration options for Ray to - function properly. - - -.. _`Ray repository`: https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/kubernetes - -.. _k8s-cluster-launcher: - -Managing Clusters with the Ray Cluster Launcher -=============================================== - -This section briefly explains how to use the Ray Cluster Launcher to launch a Ray cluster on your existing Kubernetes cluster. - -First, install the Kubernetes API client (``pip install kubernetes``), then make sure your Kubernetes credentials are set up properly to access the cluster (if a command like ``kubectl get pods`` succeeds, you should be good to go). - -Once you have ``kubectl`` configured locally to access the remote cluster, you should be ready to launch your cluster. 
The provided `ray/python/ray/autoscaler/kubernetes/example-full.yaml `__ cluster config file will create a small cluster of one pod for the head node configured to autoscale up to two worker node pods, with all pods requiring 1 CPU and 0.5GiB of memory. - -Test that it works by running the following commands from your local machine: - -.. _cluster-launcher-commands: - -.. code-block:: bash - - # Create or update the cluster. When the command finishes, it will print - # out the command that can be used to get a remote shell into the head node. - $ ray up ray/python/ray/autoscaler/kubernetes/example-full.yaml - - # List the pods running in the cluster. You shoud only see one head node - # until you start running an application, at which point worker nodes - # should be started. Don't forget to include the Ray namespace in your - # 'kubectl' commands ('ray' by default). - $ kubectl -n ray get pods - - # Get a remote screen on the head node. - $ ray attach ray/python/ray/autoscaler/kubernetes/example-full.yaml - $ # Try running a Ray program with 'ray.init(address="auto")'. - - # View monitor logs - $ ray monitor ray/python/ray/autoscaler/kubernetes/example-full.yaml - - # Tear down the cluster - $ ray down ray/python/ray/autoscaler/kubernetes/example-full.yaml - -* Learn about :ref:`running Ray programs on Kubernetes ` - -.. _k8s-operator: - -Managing clusters with the Ray Kubernetes Operator -================================================== - -.. role:: bash(code) - :language: bash - -This section explains how to use the Ray Kubernetes Operator to launch a Ray cluster on your existing Kubernetes cluster. - -The example commands in this document launch six Kubernetes pods, using a total of 6 CPU and 3.5Gi memory. -If you are experimenting using a test Kubernetes environment such as `minikube`_, make sure to provision sufficient resources, e.g. -:bash:`minikube start --cpus=6 --memory=\"4G\"`. 
-Alternatively, reduce resource usage by editing the ``yaml`` files referenced in this document; for example, reduce ``minWorkers`` -in ``example_cluster.yaml`` and ``example_cluster2.yaml``. +Deploying on Kubernetes +======================= .. note:: - 1. The Ray Kubernetes Operator is still experimental. For the yaml files in the examples below, we recommend using the latest master version of Ray. - 2. The Ray Kubernetes Operator requires Kubernetes version at least ``v1.17.0``. Check Kubernetes version info with the command :bash:`kubectl version`. + This document is mainly for advanced Kubernetes usage. The easiest way to run a Ray cluster on Kubernetes is by using the built-in Cluster Launcher. Please see the :ref:`Cluster Launcher documentation ` for details. -Applying the RayCluster Custom Resource Definition --------------------------------------------------- -The Ray Kubernetes operator works by managing a user-submitted `Kubernetes Custom Resource`_ (CR) called a ``RayCluster``. -A RayCluster custom resource describes the desired state of the Ray cluster. -To get started, we need to apply the `Kubernetes Custom Resource Definition`_ (CRD) defining a RayCluster. +This document assumes that you have access to a Kubernetes cluster and have +``kubectl`` installed locally and configured to access the cluster. It will +first walk you through how to deploy a Ray cluster on your existing Kubernetes +cluster, then explore a few different ways to run programs on the Ray cluster. -.. code-block:: shell +To learn about deploying an autoscaling Ray cluster using :ref:`Ray's Kubernetes operator`, read +:ref:`here`. - $ kubectl apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml +For information on using GPUs with Ray on Kubernetes, see :ref:`here`. 
- customresourcedefinition.apiextensions.k8s.io/rayclusters.cluster.ray.io created +The configuration ``yaml`` files used here are provided in the `Ray repository`_ +as examples to get you started. When deploying real applications, you will probably +want to build and use your own container images, add more worker nodes to the +cluster (or use the `Kubernetes Horizontal Pod Autoscaler`_), and change the +resource requests for the head and worker nodes. Refer to the provided ``yaml`` +files to be sure that you maintain important configuration options for Ray to +function properly. -.. note:: +.. _`Ray repository`: https://github.com/ray-project/ray/tree/master/doc/kubernetes - The file ``cluster_crd.yaml`` defining the CRD is not meant to meant to be modified by the user. Rather, users :ref:`configure ` a RayCluster CR via a file like `ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml `__. - The Kubernetes API server then validates the user-submitted RayCluster resource against the CRD. +Creating a Ray Namespace +------------------------ -Picking a Kubernetes Namespace -------------------------------- -The rest of the Kubernetes resources we will use are `namespaced`_. -You can use an existing namespace for your Ray clusters or create a new one if you have permissions. -For this example, we will create a namespace called ``ray``. +First, create a `Kubernetes Namespace`_ for Ray resources on your cluster. The +following commands will create resources under this Namespace, so if you want +to use a different one than ``ray``, please be sure to also change the +`namespace` fields in the provided ``yaml`` files and anytime you see a ``-n`` +flag passed to ``kubectl``. .. 
code-block:: shell - $ kubectl create namespace ray + $ kubectl create -f ray/doc/kubernetes/ray-namespace.yaml - namespace/ray created - -Starting the Operator +Starting a Ray Cluster ---------------------- -To launch the operator in our namespace, we execute the following command. +.. toctree:: + :hidden: -.. code-block:: shell + /cluster/k8s-operator.rst - $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml +A Ray cluster consists of a single head node and a set of worker nodes (the +provided ``ray-cluster.yaml`` file will start 3 worker nodes). In the example +Kubernetes configuration, this is implemented as: - serviceaccount/ray-operator-serviceaccount created - role.rbac.authorization.k8s.io/ray-operator-role created - rolebinding.rbac.authorization.k8s.io/ray-operator-rolebinding created - pod/ray-operator-pod created +- A ``ray-head`` `Kubernetes Service`_ that enables the worker nodes to discover the location of the head node on start up. +- A ``ray-head`` `Kubernetes Deployment`_ that backs the ``ray-head`` Service with a single head node pod (replica). +- A ``ray-worker`` `Kubernetes Deployment`_ with multiple worker node pods (replicas) that connect to the ``ray-head`` pod using the ``ray-head`` Service. -The output shows that we've launched a Pod named ``ray-operator-pod``. This is the pod that runs the operator process. -The ServiceAccount, Role, and RoleBinding we have created grant the operator pod the `permissions`_ it needs to manage Ray clusters. +Note that because the head and worker nodes are Deployments, Kubernetes will +automatically restart pods that crash to maintain the correct number of +replicas. -.. _operator-launch: +- If a worker node goes down, a replacement pod will be started and joined to the cluster. +- If the head node goes down, it will be restarted. This will start a new Ray cluster. 
Worker nodes that were connected to the old head node will crash and be restarted, connecting to the new head node when they come back up. -Launching Ray Clusters ----------------------- -Finally, to launch a Ray cluster, we create a RayCluster custom resource. +Try deploying a cluster with the provided Kubernetes config by running the +following command: .. code-block:: shell - $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml + $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml - raycluster.cluster.ray.io/example-cluster created - -The operator detects the RayCluster resource we've created and launches an autoscaling Ray cluster. -Our RayCluster configuration specifies ``minWorkers:2`` in the second entry of ``spec.podTypes``, so we get a head node and two workers upon launch. - -.. note:: - - For more details about RayCluster resources, we recommend take a looking at the annotated example `example_cluster.yaml `__ applied in the last command. +Verify that the pods are running by running ``kubectl get pods -n ray``. You +may have to wait up to a few minutes for the pods to enter the 'Running' +state on the first run. .. code-block:: shell - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-hbxvv 1/1 Running 0 72s - example-cluster-ray-worker-4hvv6 1/1 Running 0 64s - example-cluster-ray-worker-78kp5 1/1 Running 0 64s - ray-operator-pod 1/1 Running 0 2m33s - -We see four pods: the operator, the Ray head node, and two Ray worker nodes. - -Let's launch another cluster in the same namespace, this one specifiying ``minWorkers:1``. + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-6bxvz 1/1 Running 0 10s + ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 5s + ray-worker-5c49b7cc57-d9m86 1/1 Running 0 5s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 5s -.. code-block:: shell +.. 
note:: - $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + You might see a nonzero number of RESTARTS for the worker pods. That can + happen when the worker pods start up before the head pod and the workers + aren't able to connect. This shouldn't affect the behavior of the cluster. -We confirm that both clusters are running in our namespace. +To change the number of worker nodes in the cluster, change the ``replicas`` +field in the worker deployment configuration in that file and then re-apply +the config as follows: .. code-block:: shell - $ kubectl -n ray get rayclusters - NAME STATUS AGE - example-cluster Running 19s - example-cluster2 Running 19s - - - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-th4wv 1/1 Running 0 10m - example-cluster-ray-worker-q9pjn 1/1 Running 0 10m - example-cluster-ray-worker-qltnp 1/1 Running 0 10m - example-cluster2-ray-head-kj5mg 1/1 Running 0 10s - example-cluster2-ray-worker-qsgnd 1/1 Running 0 1s - ray-operator-pod 1/1 Running 0 10m + # Edit 'ray/doc/kubernetes/ray-cluster.yaml' and change the 'replicas' + # field under the ray-worker deployment to, e.g., 4. -Now we can :ref:`run Ray programs` on our Ray clusters. - -.. _operator-logs: - -Monitoring ----------- -Autoscaling logs are written to the operator pod's ``stdout`` and can be accessed with :code:`kubectl logs`. -Each line of output is prefixed by the name of the cluster followed by a colon. -The following command gets the last hundred lines of autoscaling logs for our second cluster. - -.. code-block:: shell + # Re-apply the new configuration to the running deployment. 
+ $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml + service/ray-head unchanged + deployment.apps/ray-head unchanged + deployment.apps/ray-worker configured - $ kubectl -n ray logs ray-operator-pod | grep ^example-cluster2: | tail -n 100 + # Verify that there are now the correct number of worker pods running. + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-6bxvz 1/1 Running 0 30s + ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 25s + ray-worker-5c49b7cc57-d9m86 1/1 Running 0 25s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 25s + ray-worker-5c49b7cc57-zzfg2 1/1 Running 0 0s -The output should include monitoring updates that look like this: +To validate that the restart behavior is working properly, try killing pods +and checking that they are restarted by Kubernetes: .. code-block:: shell - example-cluster2:2020-12-12 13:55:36,814 DEBUG autoscaler.py:693 -- Cluster status: 1 nodes - example-cluster2: - MostDelayedHeartbeats: {'172.17.0.4': 0.04093289375305176, '172.17.0.5': 0.04084634780883789} - example-cluster2: - NodeIdleSeconds: Min=36 Mean=38 Max=41 - example-cluster2: - ResourceUsage: 0.0/2.0 CPU, 0.0/1.0 Custom1, 0.0/1.0 is_spot, 0.0 GiB/0.58 GiB memory, 0.0 GiB/0.1 GiB object_store_memory - example-cluster2: - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 - example-cluster2:Worker node types: - example-cluster2: - worker-nodes: 1 - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:148 -- Cluster resources: [{'object_store_memory': 1.0, 'node:172.17.0.4': 1.0, 'memory': 5.0, 'CPU': 1.0}, {'object_store_memory': 1.0, 'is_spot': 1.0, 'memory': 6.0, 'node:172.17.0.5': 1.0, 'Custom1': 1.0, 'CPU': 1.0}] - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:149 -- Node counts: defaultdict(, {'head-node': 1, 'worker-nodes - ': 1}) - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:159 -- Placement group demands: [] - example-cluster2:2020-12-12 13:55:36,870 INFO 
resource_demand_scheduler.py:186 -- Resource demands: [] - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:187 -- Unfulfilled demands: [] - example-cluster2:2020-12-12 13:55:36,891 INFO resource_demand_scheduler.py:209 -- Node requests: {} - example-cluster2:2020-12-12 13:55:36,903 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). - example-cluster2:2020-12-12 13:55:36,923 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). + # Delete a worker pod. + $ kubectl -n ray delete pod ray-worker-5c49b7cc57-c6xs8 + pod "ray-worker-5c49b7cc57-c6xs8" deleted -Cleaning Up ------------ -We shut down a Ray cluster by deleting the associated RayCluster resource. -Either of the next two commands will delete our second cluster ``example-cluster2``. - -.. code-block:: shell - - $ kubectl -n ray delete raycluster example-cluster2 - # OR - $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + # Check that a new worker pod was started (this may take a few seconds). + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-6bxvz 1/1 Running 0 45s + ray-worker-5c49b7cc57-d9m86 1/1 Running 0 40s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 40s + ray-worker-5c49b7cc57-ypq8x 1/1 Running 0 0s -The pods associated with ``example-cluster2`` go into the ``TERMINATING`` phase. In a few moments, we check that these pods are gone: + # Delete the head pod. + $ kubectl -n ray delete pod ray-head-5455bb66c9-6bxvz + pod "ray-head-5455bb66c9-6bxvz" deleted -.. code-block:: shell + # Check that a new head pod was started and the worker pods were restarted. 
+ $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-gqzql 1/1 Running 0 0s + ray-worker-5c49b7cc57-d9m86 1/1 Running 1 50s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 1 50s + ray-worker-5c49b7cc57-ypq8x 1/1 Running 1 10s + + # You can even try deleting all of the pods in the Ray namespace and checking + # that Kubernetes brings the right number back up. + $ kubectl -n ray delete pods --all + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-7l6xj 1/1 Running 0 10s + ray-worker-5c49b7cc57-57tpv 1/1 Running 0 10s + ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 10s + ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 10s - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-th4wv 1/1 Running 0 57m - example-cluster-ray-worker-q9pjn 1/1 Running 0 56m - example-cluster-ray-worker-qltnp 1/1 Running 0 56m - ray-operator-pod 1/1 Running 0 57m +.. _ray-k8s-run: -Only the operator pod and the first ``example-cluster`` remain. +Running Ray Programs +-------------------- -To finish clean-up, we delete the cluster ``example-cluster`` and then the operator's resources. +This section assumes that you have a running Ray cluster (if you don't, please +refer to the section above to get started) and will walk you through three +different options to run a Ray program on it: -.. code-block:: shell +1. Using `kubectl exec` to run a Python script. +2. Using `kubectl exec -it bash` to work interactively in a remote shell. +3. Submitting a `Kubernetes Job`_. - $ kubectl -n ray delete raycluster example-cluster - $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml +Running a program using 'kubectl exec' +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you like, you can delete the RayCluster customer resource definition. -(Using the operator again will then require reapplying the CRD.) 
+To run an example program that tests object transfers between nodes in the +cluster, try the following commands (don't forget to replace the head pod name +- you can find it by running ``kubectl -n ray get pods``): .. code-block:: shell - $ kubectl delete crd rayclusters.cluster.ray.io - # OR - $ kubectl delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml - - -.. _ray-k8s-interact: - -Interacting with a Ray Cluster -============================== -:ref:`Ray Client ` allows you to connect to your Ray cluster on Kubernetes and execute Ray programs. -The Ray Client server runs the Ray head node, by default on port 10001. - -:ref:`Ray Dashboard ` gives visibility into the state of your cluster. -By default, the dashboard uses port 8265 on the Ray head node. - -.. _k8s-service: + # Copy the test script onto the head node. + $ kubectl -n ray cp ray/doc/kubernetes/example.py ray-head-5455bb66c9-7l6xj:/example.py -Configuring a head node service -------------------------------- -To use Ray Client and Ray Dashboard, -you can connect via a `Kubernetes Service`_ targeting the relevant ports on the head node: + # Run the example program on the head node. + $ kubectl -n ray exec ray-head-5455bb66c9-7l6xj -- python example.py + # You should see repeated output for 10 iterations and then 'Success!' -.. _svc-example: +Running a program in a remote shell +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: yaml - - apiVersion: v1 - kind: Service - metadata: - name: example-cluster-ray-head - spec: - # This selector must match the head node pod's selector. - selector: - component: example-cluster-ray-head - ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - - -The head node pod's ``metadata`` should have a ``label`` matching the service's ``selector`` field: +You can also run tasks interactively on the cluster by connecting a remote +shell to one of the pods. 
-.. code-block:: yaml +.. code-block:: shell - apiVersion: v1 - kind: Pod - metadata: - # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - # Must match the head node service selector above if a head node - # service is required. - labels: - component: example-cluster-ray-head + # Copy the test script onto the head node. + $ kubectl -n ray cp ray/doc/kubernetes/example.py ray-head-5455bb66c9-7l6xj:/example.py -- The Ray Kubernetes Operator automatically configures a default service exposing ports 10001 and 8265 \ - on the head node pod. The Operator also adds the relevant label to the head node pod's configuration. \ - If this default service does not suit your use case, you can modify the service or create a new one, \ - for example by using the tools ``kubectl edit``, ``kubectl create``, or ``kubectl apply``. + # Get a remote shell to the head node. + $ kubectl -n ray exec -it ray-head-5455bb66c9-7l6xj -- bash -- The Ray Cluster launcher does not automatically configure a service targeting the head node. A \ - head node service can be specified in the cluster launching config's ``provider.services`` field. The example cluster lauching \ - config `example-full.yaml `__ includes \ - the :ref:`above ` service configuration as an example. + # Run the example program on the head node. + root@ray-head-6f566446c-5rdmb:/# python example.py + # You should see repeated output for 10 iterations and then 'Success!' -After launching a Ray cluster with either the Operator or Cluster Launcher, you can view the configured service: +You can also start an IPython interpreter to work interactively: .. code-block:: shell - $ kubectl -n ray get services + # From your local machine. + $ kubectl -n ray exec -it ray-head-5455bb66c9-7l6xj -- ipython - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - example-cluster-ray-head ClusterIP 10.106.123.159 10001/TCP,8265/TCP 52s + # From a remote shell on the head node. 
+ $ kubectl -n ray exec -it ray-head-5455bb66c9-7l6xj -- bash + root@ray-head-6f566446c-5rdmb:/# ipython -.. _ray-k8s-run: +Once you have the IPython interpreter running, try running the following example +program: -Running Ray Programs --------------------- -Given a running Ray cluster and a :ref:`Service ` exposing the Ray Client server's port on the head pod, -we can now run Ray programs on our cluster. +.. code-block:: python -In the following examples, we assume that we have a running Ray cluster with one head node and -two worker nodes. This can be achieved in one of two ways: + from collections import Counter + import platform + import time + import ray -- Using the :ref:`Operator ` with the example resource `ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml `__. -- Using :ref:`Cluster Launcher `. Modify the example file `ray/python/ray/autoscaler/kubernetes/example-full.yaml `__ - by setting the field ``available_node_types.worker_node.min_workers`` - to 2 and then run ``ray up`` with the modified config. + ray.init(address="$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY") + @ray.remote + def f(x): + time.sleep(0.01) + return x + (platform.node(), ) -Using Ray Client to connect from within the Kubernetes cluster -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can connect to your Ray cluster from another pod in the same Kubernetes cluster. + # Check that objects can be transferred from each node to each other node. + %time Counter(ray.get([f.remote(f.remote(())) for _ in range(100)])) -For example, you can submit a Ray application to run on the Kubernetes cluster as a `Kubernetes +Submitting a Job +~~~~~~~~~~~~~~~~ + +You can also submit a Ray application to run on the cluster as a `Kubernetes Job`_. The Job will run a single pod running the Ray driver program to completion, then terminate the pod but allow you to access the logs. 
-The following command submits a Job which executes an `example Ray program`_. +To submit a Job that downloads and executes an `example program`_ that tests +object transfers between nodes in the cluster, run the following command: -.. code-block:: yaml +.. code-block:: shell - $ kubectl create -f ray/python/ray/autoscaler/kubernetes/job-example.yaml + $ kubectl create -f ray/doc/kubernetes/ray-job.yaml + job.batch/ray-test-job-kw5gn created -The program executed by the Job waits for three Ray nodes to connect and then tests object transfer -between the nodes. Note that the program uses the environment variables -``EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_HOST`` and ``EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_PORT_CLIENT`` -to access Ray Client. These `environment variables`_ are set by Kubernetes based on -the service we are using to expose the Ray head node. +.. _`example program`: https://github.com/ray-project/ray/blob/master/doc/kubernetes/example.py To view the output of the Job, first find the name of the pod that ran it, then fetch its logs: @@ -432,15 +256,16 @@ then fetch its logs: .. code-block:: shell $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-rpqfb 1/1 Running 0 11m - example-cluster-ray-worker-4c7cn 1/1 Running 0 11m - example-cluster-ray-worker-zvglb 1/1 Running 0 11m - ray-test-job-8x2pm-77lb5 1/1 Running 0 8s + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-7l6xj 1/1 Running 0 15s + ray-test-job-kw5gn-5g7tv 0/1 Completed 0 10s + ray-worker-5c49b7cc57-57tpv 1/1 Running 0 15s + ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 15s + ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 15s # Fetch the logs. You should see repeated output for 10 iterations and then # 'Success!' 
- $ kubectl -n ray logs ray-test-job-8x2pm-77lb5 + $ kubectl -n ray logs ray-test-job-kw5gn-5g7tv To clean up the resources created by the Job after checking its output, run the following: @@ -457,139 +282,94 @@ the following: # Verify that the Job's pod was cleaned up. $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-rpqfb 1/1 Running 0 11m - example-cluster-ray-worker-4c7cn 1/1 Running 0 11m - example-cluster-ray-worker-zvglb 1/1 Running 0 11m - -.. _`environment variables`: https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables -.. _`example Ray program`: https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/kubernetes/example_scripts/job_example.py - + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-7l6xj 1/1 Running 0 60s + ray-worker-5c49b7cc57-57tpv 1/1 Running 0 60s + ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 60s + ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 60s -Using Ray Client to connect from outside the Kubernetes cluster -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To connect to the Ray cluster from outside your Kubernetes cluster, -the head node Service needs to communicate with the outside world. +Cleaning Up +----------- -One way to achieve this is by port-forwarding. -Run the following command locally: +To delete a running Ray cluster, you can run the following command: .. code-block:: shell - $ kubectl -n ray port-forward service/example-cluster-ray-head 10001:10001 + kubectl delete -f ray/doc/kubernetes/ray-cluster.yaml -`Alternatively`, you can find the head node pod and connect to it directly with -the following command: +.. _k8s-gpus: -.. code-block:: shell +Using GPUs +---------- - # Substitute the name of your Ray cluster if using a name other than "example-cluster". 
- $ kubectl -n ray port-forward \ - $(kubectl -n ray get pods -l ray-cluster-name=example-cluster -l ray-node-type=head -o custom-columns=:metadata.name) 10001:10001 +To use GPUs on Kubernetes, you will need to configure both your Kubernetes setup and add additional values to your Ray cluster configuration. -Then open a new shell and try out a sample program: +For relevant documentation for GPU usage on different clouds, see instructions for `GKE`_, for `EKS`_, and for `AKS`_. -.. code-block:: shell +The `Ray Docker Hub `_ hosts CUDA-based images packaged with Ray for use in Kubernetes pods. +For example, the image ``rayproject/ray-ml:nightly-gpu`` is ideal for running GPU-based ML workloads with the most recent nightly build of Ray. +Read :ref:`here` for further details on Ray images. - $ python ray/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py +Using Nvidia GPUs requires specifying the relevant resource `limits` in the container fields of your Kubernetes configurations. +(Kubernetes `sets `_ +the GPU request equal to the limit.) The configuration for a pod running a Ray GPU image and +using one Nvidia GPU looks like this: -The program in this example uses ``ray.util.connect(127.0.0.1:10001)`` to connect to the Ray cluster. +.. code-block:: yaml + apiVersion: v1 + kind: Pod + metadata: + generateName: example-cluster-ray-worker + spec: + ... + containers: + - name: ray-node + image: rayproject/ray:nightly-gpu + ... + resources: + cpu: 1000m + memory: 512Mi + limits: + memory: 512Mi + nvidia.com/gpu: 1 + +GPU taints and tolerations +~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: - Connecting with Ray client requires using the matching minor versions of Python (for example 3.7) - on the server and client end -- that is on the Ray head node and in the environment where - ``ray.util.connect`` is invoked. Note that the default ``rayproject/ray`` images use Python 3.7. - Nightly builds are now available for Python 3.6 and 3.8 at the `Ray Docker Hub `_. 
+ Users using a managed Kubernetes service probably don't need to worry about this section. -Running the program on the head node -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -It is also possible to execute a Ray program on the Ray head node. -(Replace the pod name with the name of your head pod -- you can find it by running ``kubectl -n ray get pods``.) +The `Nvidia gpu plugin`_ for Kubernetes applies `taints`_ to GPU nodes; these taints prevent non-GPU pods from being scheduled on GPU nodes. +Managed Kubernetes services like GKE, EKS, and AKS automatically apply matching `tolerations`_ +to pods requesting GPU resources. Tolerations are applied by means of Kubernetes's `ExtendedResourceToleration`_ `admission controller`_. +If this admission controller is not enabled for your Kubernetes cluster, you may need to manually add a GPU toleration to each of your GPU pod configurations. For example, -.. code-block:: shell - - $ kubectl -n ray exec example-cluster-ray-head-5455bb66c9-7l6xj -- python /home/ray/anaconda3/lib/python3.7/site-packages/ray/autoscaler/kubernetes/example_scripts/run_on_head.py - - -Alternatively, you can run tasks interactively on the cluster by connecting a remote -shell to one of the pods. - -.. code-block:: shell - - # Get a remote shell to the head node. - $ kubectl -n ray exec -it example-cluster-ray-head-5455bb66c9-7l6xj -- bash - - # Run the example program on the head node. - root@ray-head-6f566446c-5rdmb:/# python /home/ray/anaconda3/lib/python3.7/site-packages/ray/autoscaler/kubernetes/example_scripts/run_on_head.py - # You should see repeated output for 10 iterations and then 'Success!' - - -The program in this example uses ``ray.init(address="auto")`` to connect to the Ray cluster. - -Accessing the Dashboard ------------------------ - -The Ray Dashboard can accessed locally using ``kubectl port-forward``. - -..
code-block:: shell - - $ kubectl -n ray port-forward service/example-cluster-ray-head 8265:8265 - -After running the above command locally, the Dashboard will be accessible at ``http://localhost:8265``. - -You can also monitor the state of the cluster with ``kubectl logs`` when using the :ref:`Operator ` or with ``ray monitor`` when using -the :ref:`Ray Cluster Launcher `. - -.. warning:: - The Dashboard currently shows resource limits of the physical host each Ray node is running on, - rather than the limits of the container the node is running in. - This is a known bug tracked `here `_. - - -.. _k8s-comparison: - -Cluster Launcher vs Operator -============================ - -We compare the Ray Cluster Launcher and Ray Kubernetes Operator as methods of managing an autoscaling Ray cluster. - - -Comparison of use cases ------------------------ - -- The Cluster Launcher is convenient for development and experimentation. Using the Cluster Launcher requires a local installation of Ray. The Ray CLI then provides a convenient interface for interacting with a Ray cluster. - -- The Operator is geared towards production use cases. It does not require installing Ray locally - all interactions with your Ray cluster are mediated by Kubernetes. - - -Comparison of architectures ---------------------------- - -- With the Cluster Launcher, the user launches a Ray cluster from their local environment by invoking ``ray up``. This provisions a pod for the Ray head node, which then runs the `autoscaling process `__. - -- The `Operator `__ centralizes cluster launching and autoscaling in the `Operator pod `__. \ - The user creates a `Kubernetes Custom Resource`_ describing the intended state of the Ray cluster. \ - The Operator then detects the resource, launches a Ray cluster, and runs the autoscaling process in the operator pod. \ - The Operator can manage multiple Ray clusters by running an autoscaling process for each Ray cluster. 
- -Comparison of configuration options ------------------------------------ - -The configuration options for the two methods are completely analogous - compare sample configurations for the `Cluster Launcher `__ -and for the `Operator `__. -With a few exceptions, the fields of the RayCluster resource managed by the Operator are camelCase versions of the corresponding snake_case Cluster Launcher fields. -In fact, the Operator `internally `__ converts -RayCluster resources to Cluster Launching configs. - -A summary of the configuration differences: +.. code-block:: yaml -- The Cluster Launching field ``available_node_types`` for specifiying the types of pods available for autoscaling is renamed to ``podTypes`` in the Operator's RayCluster configuration. -- The Cluster Launching field ``resources`` for specifying custom Ray resources provided by a node type is renamed to ``rayResources`` in the Operator's RayCluster configuration. -- The ``provider`` field in the Cluster Launching config has no analogue in the Operator's RayCluster configuration. (The Operator fills this field internally.) -- * When using the Cluster Launcher, ``head_ray_start_commands`` should include the argument ``--autoscaling-config=~/ray_bootstrap_config.yaml``; this is important for the configuration of the head node's autoscaler. - * On the other hand, the Operator's ``headRayStartCommands`` should include a ``--no-monitor`` flag to prevent the autoscaling/monitoring process from running on the head node. + apiVersion: v1 + kind: Pod + metadata: + generateName: example-cluster-ray-worker + spec: + ... + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + ... + containers: + - name: ray-node + image: rayproject/ray:nightly-gpu + ... + +Further reference and discussion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Read about Kubernetes device plugins `here `__, +about Kubernetes GPU plugins `here `__, +and about Nvidia's GPU plugin for Kubernetes `here `__. 
+ +If you run into problems setting up GPUs for your Ray cluster on Kubernetes, please reach out to us at the `Discussion Board`_. Questions or Issues? -------------------- @@ -597,13 +377,19 @@ Questions or Issues? .. include:: /_help.rst - -.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/ +.. _`Kubernetes Horizontal Pod Autoscaler`: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ +.. _`Kubernetes Namespace`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ .. _`Kubernetes Service`: https://kubernetes.io/docs/concepts/services-networking/service/ -.. _`Kubernetes Operator`: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/ -.. _`Kubernetes Custom Resource`: https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/ -.. _`Kubernetes Custom Resource Definition`: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/ -.. _`annotation`: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/#attaching-metadata-to-objects -.. _`permissions`: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ -.. _`minikube`: https://minikube.sigs.k8s.io/docs/start/ -.. _`namespaced`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ +.. _`Kubernetes Deployment`: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/ +.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/ + +.. _`Discussion Board`: https://discuss.ray.io/ +.. _`GKE`: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus +.. _`EKS`: https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html +.. _`AKS`: https://docs.microsoft.com/en-us/azure/aks/gpu-cluster + +.. _`tolerations`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ +..
_`taints`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ +.. _`Nvidia gpu plugin`: https://github.com/NVIDIA/k8s-device-plugin +.. _`admission controller`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/ +.. _`ExtendedResourceToleration`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/#extendedresourcetoleration diff --git a/doc/source/package-ref.rst b/doc/source/package-ref.rst index ebe059f972b1..db3cbd56004a 100644 --- a/doc/source/package-ref.rst +++ b/doc/source/package-ref.rst @@ -211,7 +211,6 @@ Experimental APIs .. automodule:: ray.experimental :members: -.. _ray-cli: The Ray Command Line API ------------------------ diff --git a/doc/source/ray-dashboard.rst b/doc/source/ray-dashboard.rst index 6c7276b2a5da..09a935fa2311 100644 --- a/doc/source/ray-dashboard.rst +++ b/doc/source/ray-dashboard.rst @@ -1,5 +1,3 @@ -.. _ray-dashboard: - Ray Dashboard ============= Ray's built-in dashboard provides metrics, charts, and other features that help From c5dc4a25f63abec4d63064636de49d0431f7d92c Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 024/244] Revert "[hotfix] Disable dashboard agent windows (#14062)" This reverts commit 42ad8ea636864637501ba1deea46df0e044856f6. --- dashboard/agent.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index 7f77e2f3c09c..a1afb5f77f2a 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -7,7 +7,6 @@ import sys import socket import json -import time import traceback import aiohttp @@ -300,16 +299,6 @@ async def _check_parent(): max_bytes=args.logging_rotate_bytes, backup_count=args.logging_rotate_backup_count) - # The dashboard is currently broken on Windows. - # https://github.com/ray-project/ray/issues/14026. 
- if sys.platform == "win32": - logger.warning( - "The dashboard is currently disabled on windows." - "See https://github.com/ray-project/ray/issues/14026" - "for more details") - while True: - time.sleep(999) - agent = DashboardAgent( args.node_ip_address, args.redis_address, From 9128e6805dd6ec715dbaff8f0d628418d25f71ce Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 025/244] Revert "[Doc] Add PTL and RAG to community integrations (#14064)" This reverts commit 7e4892b946aad0644cf580ae04bab9b61b9ed6da. --- doc/source/ray-libraries.rst | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/doc/source/ray-libraries.rst b/doc/source/ray-libraries.rst index 604e680befac..3a0f2d8673c1 100644 --- a/doc/source/ray-libraries.rst +++ b/doc/source/ray-libraries.rst @@ -46,14 +46,8 @@ Hugging Face Transformers |hugging| State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0. -It integrates with Ray for distributed hyperparameter tuning of transformer models: - [`Link to integration `__] -As well as for distributed document retrieval for Retrieval Augmented Generation Models - -[`Link to integration `__] - Intel Analytics Zoo |zoo| ------------------------- @@ -88,13 +82,6 @@ PyCaret is an open source low-code machine learning library in Python that aims GitHub: `https://github.com/pycaret/pycaret `_ -PyTorch Lightning |ptl| ------------------------ - -PyTorch Lightning is a popular open-source library that provides a high level interface for PyTorch. The goal of PyTorch Lightning is to structure your PyTorch code to abstract the details of training, making AI research scalable and fast to iterate on. - -[`Link to integration `__] - RayDP |raydp| ------------- @@ -164,10 +151,6 @@ XGBoost is a popular gradient boosting library for classification and regression :class: inline-figure :height: 30 -.. 
|ptl| image:: images/pytorch_lightning_small.png - :class: inline-figure - :height: 30 - .. |raydp| image:: images/intel.png :class: inline-figure :height: 30 From 555d1ad3b55fb7c4e081ac7c946f21e9d930ab31 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 026/244] Revert "[Tune] Revert Pinning Tune Dependencies (#14059)" This reverts commit d5bc5475d33dec6622a952c436985178664deef9. --- .github/dependabot.yml | 12 + ci/travis/install-dependencies.sh | 10 +- docker/ray-ml/Dockerfile | 3 +- .../linux-py3.6-requirements_tune.txt | 885 ++++++++++++++++++ .../linux-py3.7-requirements_tune.txt | 877 +++++++++++++++++ .../linux-py3.8-requirements_tune.txt | 864 +++++++++++++++++ python/{ => requirements}/requirements.txt | 0 ...irements_tune.txt => requirements_tune.in} | 3 + 8 files changed, 2651 insertions(+), 3 deletions(-) create mode 100644 python/requirements/linux-py3.6-requirements_tune.txt create mode 100644 python/requirements/linux-py3.7-requirements_tune.txt create mode 100644 python/requirements/linux-py3.8-requirements_tune.txt rename python/{ => requirements}/requirements.txt (100%) rename python/requirements/{requirements_tune.txt => requirements_tune.in} (92%) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9f8b6b7a730a..3074b6042bc9 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,3 +21,15 @@ updates: open-pull-requests-limit: 3 reviewers: - "ray-project/ray-tune" + ignore: + # Ignore pinned dependencies in requirements.txt. 
+ - dependency-name: aiohttp + - dependency-name: msgpack + - dependency-name: opencv-python-headless + - dependency-name: pandas + - dependency-name: scipy + - dependency-name: pydantic + - dependency-name: cython + - dependency-name: llmvlite + - dependency-name: pytest + - dependency-name: scikit-learn diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index ea4691723d99..498aaf419533 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -293,7 +293,7 @@ install_dependencies() { local status="0"; local errmsg=""; for _ in {1..3}; do - errmsg=$(CC=gcc pip install -r "${WORKSPACE_DIR}"/python/requirements.txt 2>&1) && break; + errmsg=$(CC=gcc pip install -r "${WORKSPACE_DIR}"/python/requirements/requirements.txt 2>&1) && break; status=$errmsg && echo "'pip install ...' failed, will retry after n seconds!" && sleep 30; done if [ "$status" != "0" ]; then @@ -324,7 +324,13 @@ install_dependencies() { # Additional Tune/SGD/Doc test dependencies. if [ "${TUNE_TESTING-}" = 1 ] || [ "${SGD_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - pip install -r "${WORKSPACE_DIR}"/python/requirements/requirements_tune.txt + if [ -n "${PYTHON-}" ] && [ "${PYTHON-}" = "3.7" ]; then + # Install Python 3.7 dependencies if 3.7 is set. + pip install -r "${WORKSPACE_DIR}"/python/requirements/linux-py3.7-requirements_tune.txt + else + # Else default to Python 3.6. + pip install -r "${WORKSPACE_DIR}"/python/requirements/linux-py3.6-requirements_tune.txt + fi fi # For Tune, install upstream dependencies. 
diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 2c5f37540a2c..908351df19d9 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -6,7 +6,8 @@ ARG PYTHON_MINOR_VERSION=7 COPY requirements.txt ./ COPY requirements_ml_docker.txt ./ COPY requirements_rllib.txt ./ -COPY requirements_tune.txt ./requirements_tune.txt +# Docker image uses Python 3.7 +COPY linux-py3."$PYTHON_MINOR_VERSION"-requirements_tune.txt ./requirements_tune.txt RUN sudo apt-get update \ && sudo apt-get install -y gcc \ diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt new file mode 100644 index 000000000000..1bafdac84b67 --- /dev/null +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -0,0 +1,885 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile requirements_tune.in +# +--find-links https://download.pytorch.org/whl/torch_stable.html + +absl-py==0.11.0 + # via tensorboard +alembic==1.4.1 + # via + # mlflow + # optuna +argon2-cffi==20.1.0 + # via notebook +async-generator==1.10 + # via nbclient +atari-py==0.2.6 + # via + # -c ../requirements.txt + # gym +attrs==20.3.0 + # via + # cmd2 + # jsonschema + # pytest +autocfg==0.0.6 + # via gluoncv +autogluon.core==0.0.16b20210125 + # via gluoncv +autograd==1.3 + # via autogluon.core +ax-platform==0.1.9 ; python_version < "3.7" + # via -r requirements_tune.in +azure-core==1.10.0 + # via azure-storage-blob +azure-storage-blob==12.7.1 + # via mlflow +backcall==0.2.0 + # via ipython +bayesian-optimization==1.2.0 + # via + # -r requirements_tune.in + # nevergrad +bcrypt==3.2.0 + # via paramiko +bleach==3.2.2 + # via nbconvert +bokeh==2.2.3 + # via dask +boto3==1.16.58 + # via + # -c ../requirements.txt + # autogluon.core + # smart-open +botocore==1.19.58 + # via + # boto3 + # s3transfer +botorch==0.2.1 + # via ax-platform +cached-property==1.5.2 + # via h5py +cachetools==4.2.0 + # via google-auth 
+certifi==2020.12.5 + # via + # kubernetes + # msrest + # requests + # sentry-sdk +cffi==1.14.4 + # via + # argon2-cffi + # bcrypt + # cryptography + # pynacl +chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -c ../requirements.txt + # databricks-cli + # distributed + # flask + # mlflow + # sacremoses + # wandb +cliff==3.6.0 + # via optuna +cloudpickle==1.6.0 + # via + # dask + # distributed + # gym + # hyperopt + # mlflow + # tensorflow-probability +cma==3.0.3 + # via nevergrad +cmaes==0.7.0 + # via optuna +cmd2==1.4.0 + # via cliff +colorama==0.4.4 + # via + # -c ../requirements.txt + # cmd2 +colorlog==4.7.2 + # via optuna +configparser==5.0.1 + # via wandb +configspace==0.4.10 + # via + # -r requirements_tune.in + # autogluon.core + # hpbandster +contextvars==2.4 + # via distributed +cryptography==3.3.1 + # via + # azure-storage-blob + # paramiko +cycler==0.10.0 + # via matplotlib +cython==0.29.0 + # via + # -c ../requirements.txt + # autogluon.core + # configspace +dask[complete]==2021.1.0 + # via + # -c ../requirements.txt + # autogluon.core + # distributed +databricks-cli==0.14.1 + # via mlflow +dataclasses==0.8 ; python_version < "3.7" + # via + # -c ../requirements.txt + # autocfg + # torch + # transformers +decorator==4.4.2 + # via + # ipython + # networkx + # paramz + # tensorflow-probability + # traitlets +decord==0.4.2 + # via gluoncv +defusedxml==0.6.0 + # via nbconvert +dill==0.3.3 + # via autogluon.core +distributed==2021.1.1 + # via + # autogluon.core + # dask +dm-tree==0.1.5 + # via + # -c ../requirements.txt + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb +docker==4.4.1 + # via mlflow +dragonfly-opt==0.1.6 + # via -r requirements_tune.in +entrypoints==0.3 + # via + # mlflow + # nbconvert +filelock==3.0.12 + # via + # -c ../requirements.txt + # transformers +flask==1.1.2 + # via + # -c ../requirements.txt + # mlflow + # prometheus-flask-exporter +fsspec==0.8.5 + # via + # dask + # pytorch-lightning +future==0.18.2 + # via + 
# autograd + # dragonfly-opt + # hyperopt + # pyglet + # pytorch-lightning + # torch +gast==0.4.0 + # via tensorflow-probability +gitdb==4.0.5 + # via gitpython +gitpython==3.1.12 + # via + # mlflow + # wandb +gluoncv==0.9.1 + # via -r requirements_tune.in +google-auth-oauthlib==0.4.2 + # via tensorboard +google-auth==1.24.0 + # via + # google-auth-oauthlib + # kubernetes + # tensorboard +gpy==1.9.9 + # via -r requirements_tune.in +gpytorch==1.3.1 + # via botorch +graphviz==0.8.4 + # via + # autogluon.core + # mxnet +grpcio==1.35.0 + # via + # -c ../requirements.txt + # tensorboard +gunicorn==20.0.4 + # via mlflow +gym[atari]==0.18.0 + # via + # -c ../requirements.txt + # -r requirements_tune.in +h5py==3.1.0 + # via + # -r requirements_tune.in + # keras +heapdict==1.0.1 + # via zict +hpbandster==0.7.4 + # via -r requirements_tune.in +hyperopt==0.2.5 + # via -r requirements_tune.in +idna==2.10 + # via requests +immutables==0.14 + # via contextvars +importlib-metadata==3.4.0 + # via + # cmd2 + # jsonschema + # markdown + # pluggy + # pytest + # stevedore +ipykernel==5.4.3 + # via + # ipywidgets + # jupyter + # jupyter-console + # notebook + # qtconsole +ipython-genutils==0.2.0 + # via + # nbformat + # notebook + # qtconsole + # traitlets +ipython==7.16.1 + # via + # ipykernel + # ipywidgets + # jupyter-console +ipywidgets==7.6.3 + # via jupyter +isodate==0.6.0 + # via msrest +itsdangerous==1.1.0 + # via flask +jedi==0.18.0 + # via ipython +jinja2==2.11.2 + # via + # ax-platform + # bokeh + # flask + # nbconvert + # notebook +jmespath==0.10.0 + # via + # boto3 + # botocore +joblib==1.0.0 + # via + # optuna + # sacremoses + # scikit-learn + # scikit-optimize +jsonschema==3.2.0 + # via + # -c ../requirements.txt + # nbformat +jupyter-client==6.1.11 + # via + # ipykernel + # jupyter-console + # nbclient + # notebook + # qtconsole +jupyter-console==6.2.0 + # via jupyter +jupyter-core==4.7.0 + # via + # jupyter-client + # nbconvert + # nbformat + # notebook + # qtconsole 
+jupyter==1.0.0 + # via -r requirements_tune.in +jupyterlab-pygments==0.1.2 + # via nbconvert +jupyterlab-widgets==1.0.0 + # via ipywidgets +keras==2.4.3 + # via -r requirements_tune.in +kiwisolver==1.3.1 + # via matplotlib +kubernetes==12.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +lightgbm==3.1.1 + # via -r requirements_tune.in +locket==0.2.1 + # via partd +mako==1.1.4 + # via alembic +markdown==3.3.3 + # via tensorboard +markupsafe==1.1.1 + # via + # jinja2 + # mako +matplotlib==3.3.3 + # via + # -r requirements_tune.in + # autogluon.core + # gluoncv + # zoopt +mistune==0.8.4 + # via nbconvert +mlflow==1.13.1 + # via -r requirements_tune.in +more-itertools==8.6.0 + # via pytest +msgpack==1.0.2 + # via + # -c ../requirements.txt + # distributed +msrest==0.6.19 + # via azure-storage-blob +mxnet==1.7.0.post1 + # via -r requirements_tune.in +nbclient==0.5.1 + # via nbconvert +nbconvert==6.0.7 + # via + # jupyter + # notebook +nbformat==5.1.2 + # via + # ipywidgets + # nbclient + # nbconvert + # notebook +nest-asyncio==1.4.3 + # via nbclient +netifaces==0.10.9 + # via hpbandster +networkx==2.5 + # via + # -c ../requirements.txt + # hyperopt +nevergrad==0.4.2.post5 + # via -r requirements_tune.in +notebook==6.2.0 + # via + # jupyter + # widgetsnbextension +numpy==1.19.5 + # via + # -c ../requirements.txt + # atari-py + # autogluon.core + # autograd + # bayesian-optimization + # bokeh + # cma + # cmaes + # configspace + # dask + # decord + # dragonfly-opt + # gluoncv + # gpy + # gym + # h5py + # hpbandster + # hyperopt + # keras + # lightgbm + # matplotlib + # mlflow + # mxnet + # nevergrad + # opencv-python + # optuna + # pandas + # paramz + # patsy + # pytorch-lightning + # scikit-learn + # scikit-optimize + # scipy + # statsmodels + # tensorboard + # tensorboardx + # tensorflow-probability + # torch + # torchvision + # transformers + # xgboost + # zoopt +oauthlib==3.1.0 + # via requests-oauthlib +opencv-python==4.5.1.48 + # via + # gluoncv + 
# gym +optuna==2.4.0 + # via -r requirements_tune.in +packaging==20.8 + # via + # bleach + # bokeh + # optuna + # pytest + # transformers +pandas==1.0.5 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # dask + # gluoncv + # mlflow + # statsmodels +pandocfilters==1.4.3 + # via nbconvert +paramiko==2.7.2 + # via autogluon.core +paramz==0.9.5 + # via gpy +parso==0.8.1 + # via jedi +partd==1.1.0 + # via dask +patsy==0.5.1 + # via statsmodels +pbr==5.5.1 + # via + # cliff + # stevedore +pexpect==4.8.0 + # via + # -c ../requirements.txt + # ipython +pickleshare==0.7.5 + # via ipython +pillow==7.2.0 ; platform_system != "Windows" + # via + # -c ../requirements.txt + # bokeh + # gluoncv + # gym + # matplotlib + # torchvision +plotly==4.14.3 + # via ax-platform +pluggy==0.13.1 + # via pytest +portalocker==2.0.0 + # via gluoncv +prettytable==0.7.2 + # via cliff +prometheus-client==0.9.0 + # via + # -c ../requirements.txt + # notebook + # prometheus-flask-exporter +prometheus-flask-exporter==0.18.1 + # via mlflow +promise==2.3 + # via wandb +prompt-toolkit==3.0.13 + # via + # ipython + # jupyter-console +protobuf==3.14.0 + # via + # -c ../requirements.txt + # mlflow + # tensorboard + # tensorboardx + # wandb +psutil==5.8.0 + # via + # distributed + # wandb +ptyprocess==0.7.0 + # via + # pexpect + # terminado +py==1.10.0 + # via pytest +pyaml==20.4.0 + # via scikit-optimize +pyasn1-modules==0.2.8 + # via google-auth +pyasn1==0.4.8 + # via + # pyasn1-modules + # rsa +pycparser==2.20 + # via cffi +pyglet==1.5.0 + # via gym +pygments==2.7.4 + # via + # -c ../requirements.txt + # ipython + # jupyter-console + # jupyterlab-pygments + # nbconvert + # qtconsole +pynacl==1.4.0 + # via paramiko +pyparsing==2.4.7 + # via + # cliff + # configspace + # matplotlib + # packaging +pyperclip==1.8.1 + # via cmd2 +pyro4==4.80 + # via hpbandster +pyrsistent==0.17.3 + # via jsonschema +pytest-remotedata==0.3.2 + # via -r requirements_tune.in +pytest==5.4.3 + # via + # -c 
../requirements.txt + # autogluon.core + # pytest-remotedata +python-dateutil==2.8.1 + # via + # alembic + # bokeh + # botocore + # jupyter-client + # kubernetes + # matplotlib + # mlflow + # pandas + # wandb +python-editor==1.0.4 + # via alembic +pytorch-lightning-bolts==0.2.5 + # via -r requirements_tune.in +pytorch-lightning==1.0.3 + # via + # -r requirements_tune.in + # pytorch-lightning-bolts +pytz==2020.5 + # via pandas +pyyaml==5.4.1 + # via + # -c ../requirements.txt + # autocfg + # bokeh + # cliff + # dask + # distributed + # gluoncv + # keras + # kubernetes + # mlflow + # pyaml + # pytorch-lightning + # wandb + # yacs +pyzmq==21.0.1 + # via + # jupyter-client + # notebook + # qtconsole +qtconsole==5.0.2 + # via jupyter +qtpy==1.9.0 + # via qtconsole +querystring-parser==1.2.4 + # via mlflow +regex==2020.11.13 + # via + # sacremoses + # transformers +requests-oauthlib==1.3.0 + # via + # google-auth-oauthlib + # kubernetes + # msrest +requests==2.25.1 + # via + # -c ../requirements.txt + # autogluon.core + # azure-core + # databricks-cli + # docker + # gluoncv + # kubernetes + # mlflow + # msrest + # mxnet + # requests-oauthlib + # sigopt + # tensorboard + # transformers + # wandb +retrying==1.3.3 + # via plotly +rsa==4.7 + # via google-auth +s3transfer==0.3.4 + # via boto3 +sacremoses==0.0.43 + # via transformers +scikit-learn==0.22.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in + # autogluon.core + # ax-platform + # bayesian-optimization + # gpytorch + # lightgbm + # scikit-optimize +scikit-optimize==0.8.1 + # via + # -r requirements_tune.in + # autogluon.core +scipy==1.4.1 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # bayesian-optimization + # botorch + # dragonfly-opt + # gluoncv + # gpy + # gpytorch + # gym + # hpbandster + # hyperopt + # keras + # lightgbm + # optuna + # paramz + # scikit-learn + # scikit-optimize + # statsmodels + # xgboost +send2trash==1.5.0 + # via notebook +sentencepiece==0.1.95 + # 
via transformers +sentry-sdk==0.19.5 + # via wandb +serpent==1.30.2 + # via + # hpbandster + # pyro4 +shortuuid==1.0.1 + # via wandb +sigopt==5.7.0 + # via -r requirements_tune.in +six==1.15.0 + # via + # absl-py + # argon2-cffi + # atari-py + # azure-core + # bcrypt + # bleach + # cryptography + # cycler + # databricks-cli + # dm-tree + # docker + # docker-pycreds + # dragonfly-opt + # google-auth + # gpy + # grpcio + # hyperopt + # isodate + # jsonschema + # kubernetes + # mlflow + # paramz + # patsy + # plotly + # promise + # protobuf + # pynacl + # pytest-remotedata + # python-dateutil + # querystring-parser + # retrying + # sacremoses + # tensorboard + # tensorboardx + # tensorflow-probability + # traitlets + # wandb + # websocket-client +smart_open[s3]==4.1.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in +smmap==3.0.4 + # via gitdb +sortedcontainers==2.3.0 + # via distributed +sqlalchemy==1.3.22 + # via + # alembic + # mlflow + # optuna +sqlparse==0.4.1 + # via mlflow +statsmodels==0.12.1 + # via hpbandster +stevedore==3.3.0 + # via cliff +subprocess32==3.5.4 + # via wandb +tabulate==0.8.7 + # via + # -c ../requirements.txt + # databricks-cli +tblib==1.7.0 + # via distributed +tensorboard-plugin-wit==1.8.0 + # via tensorboard +tensorboard==2.4.1 + # via pytorch-lightning +tensorboardx==2.1 + # via + # -c ../requirements.txt + # gluoncv +tensorflow-probability==0.11.1 + # via -r requirements_tune.in +terminado==0.9.2 + # via notebook +testpath==0.4.4 + # via nbconvert +timm==0.3.2 + # via -r requirements_tune.in +tokenizers==0.8.1.rc2 + # via transformers +toolz==0.11.1 + # via + # dask + # distributed + # partd +torch==1.7.0+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # botorch + # gpytorch + # pytorch-lightning + # pytorch-lightning-bolts + # timm + # torchvision +torchvision==0.8.1+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # timm +tornado==6.1 + # via + # autogluon.core + # bokeh + # 
distributed + # ipykernel + # jupyter-client + # notebook + # terminado +tqdm==4.56.0 + # via + # autogluon.core + # gluoncv + # hyperopt + # optuna + # pytorch-lightning + # sacremoses + # transformers +traitlets==4.3.3 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # nbclient + # nbconvert + # nbformat + # notebook + # qtconsole +transformers==3.1 + # via -r requirements_tune.in +typing-extensions==3.7.4.3 + # via + # bokeh + # importlib-metadata + # nevergrad + # torch +typing==3.7.4.3 + # via configspace +urllib3==1.26.2 + # via + # botocore + # kubernetes + # requests + # sentry-sdk +wandb==0.10.12 + # via -r requirements_tune.in +watchdog==1.0.2 + # via wandb +wcwidth==0.2.5 + # via + # cmd2 + # prompt-toolkit + # pytest +webencodings==0.5.1 + # via bleach +websocket-client==0.57.0 + # via + # docker + # kubernetes +werkzeug==1.0.1 + # via + # -c ../requirements.txt + # flask + # tensorboard +wheel==0.36.2 + # via + # lightgbm + # tensorboard +widgetsnbextension==3.5.1 + # via ipywidgets +xgboost==1.3.0.post0 + # via -r requirements_tune.in +yacs==0.1.8 + # via gluoncv +zict==2.0.0 + # via distributed +zipp==3.4.0 + # via importlib-metadata +zoopt==0.4.1 + # via -r requirements_tune.in + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt new file mode 100644 index 000000000000..920222b459ef --- /dev/null +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -0,0 +1,877 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile requirements_tune.in +# +--find-links https://download.pytorch.org/whl/torch_stable.html + +absl-py==0.11.0 + # via tensorboard +alembic==1.4.1 + # via + # mlflow + # optuna +argon2-cffi==20.1.0 + # via notebook +async-generator==1.10 + # via nbclient +atari-py==0.2.6 + # via + # -c 
../requirements.txt + # gym +attrs==20.3.0 + # via + # cmd2 + # jsonschema + # pytest +autocfg==0.0.6 + # via gluoncv +autogluon.core==0.0.16b20210125 + # via gluoncv +autograd==1.3 + # via autogluon.core +ax-platform==0.1.19 ; python_version >= "3.7" + # via -r requirements_tune.in +azure-core==1.10.0 + # via azure-storage-blob +azure-storage-blob==12.7.1 + # via mlflow +backcall==0.2.0 + # via ipython +bayesian-optimization==1.2.0 + # via + # -r requirements_tune.in + # nevergrad +bcrypt==3.2.0 + # via paramiko +bleach==3.2.2 + # via nbconvert +bokeh==2.2.3 + # via dask +boto3==1.16.58 + # via + # -c ../requirements.txt + # autogluon.core + # smart-open +botocore==1.19.58 + # via + # boto3 + # s3transfer +botorch==0.3.3 + # via ax-platform +cached-property==1.5.2 + # via h5py +cachetools==4.2.0 + # via google-auth +certifi==2020.12.5 + # via + # kubernetes + # msrest + # requests + # sentry-sdk +cffi==1.14.4 + # via + # argon2-cffi + # bcrypt + # cryptography + # pynacl +chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -c ../requirements.txt + # databricks-cli + # distributed + # flask + # mlflow + # sacremoses + # wandb +cliff==3.6.0 + # via optuna +cloudpickle==1.6.0 + # via + # dask + # distributed + # gym + # hyperopt + # mlflow + # tensorflow-probability +cma==3.0.3 + # via nevergrad +cmaes==0.7.0 + # via optuna +cmd2==1.4.0 + # via cliff +colorama==0.4.4 + # via + # -c ../requirements.txt + # cmd2 +colorlog==4.7.2 + # via optuna +configparser==5.0.1 + # via wandb +configspace==0.4.10 + # via + # -r requirements_tune.in + # autogluon.core + # hpbandster +cryptography==3.3.1 + # via + # azure-storage-blob + # paramiko +cycler==0.10.0 + # via matplotlib +cython==0.29.0 + # via + # -c ../requirements.txt + # autogluon.core + # configspace +dask[complete]==2021.1.0 + # via + # -c ../requirements.txt + # autogluon.core + # distributed +databricks-cli==0.14.1 + # via mlflow +dataclasses==0.6 + # via torch +decorator==4.4.2 + # via + # ipython + # networkx 
+ # paramz + # tensorflow-probability +decord==0.4.2 + # via gluoncv +defusedxml==0.6.0 + # via nbconvert +dill==0.3.3 + # via autogluon.core +distributed==2021.1.1 + # via + # autogluon.core + # dask +dm-tree==0.1.5 + # via + # -c ../requirements.txt + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb +docker==4.4.1 + # via mlflow +dragonfly-opt==0.1.6 + # via -r requirements_tune.in +entrypoints==0.3 + # via + # mlflow + # nbconvert +filelock==3.0.12 + # via + # -c ../requirements.txt + # transformers +flask==1.1.2 + # via + # -c ../requirements.txt + # mlflow + # prometheus-flask-exporter +fsspec==0.8.5 + # via + # dask + # pytorch-lightning +future==0.18.2 + # via + # autograd + # dragonfly-opt + # hyperopt + # pyglet + # pytorch-lightning + # torch +gast==0.4.0 + # via tensorflow-probability +gitdb==4.0.5 + # via gitpython +gitpython==3.1.12 + # via + # mlflow + # wandb +gluoncv==0.9.1 + # via -r requirements_tune.in +google-auth-oauthlib==0.4.2 + # via tensorboard +google-auth==1.24.0 + # via + # google-auth-oauthlib + # kubernetes + # tensorboard +gpy==1.9.9 + # via -r requirements_tune.in +gpytorch==1.3.1 + # via botorch +graphviz==0.8.4 + # via + # autogluon.core + # mxnet +grpcio==1.35.0 + # via + # -c ../requirements.txt + # tensorboard +gunicorn==20.0.4 + # via mlflow +gym[atari]==0.18.0 + # via + # -c ../requirements.txt + # -r requirements_tune.in +h5py==3.1.0 + # via + # -r requirements_tune.in + # keras +heapdict==1.0.1 + # via zict +hpbandster==0.7.4 + # via -r requirements_tune.in +hyperopt==0.2.5 + # via -r requirements_tune.in +idna==2.10 + # via requests +importlib-metadata==3.4.0 + # via + # cmd2 + # jsonschema + # markdown + # pluggy + # pytest + # stevedore +ipykernel==5.4.3 + # via + # ipywidgets + # jupyter + # jupyter-console + # notebook + # qtconsole +ipython-genutils==0.2.0 + # via + # nbformat + # notebook + # qtconsole + # traitlets +ipython==7.19.0 + # via + # ipykernel + # ipywidgets + # jupyter-console 
+ipywidgets==7.6.3 + # via jupyter +isodate==0.6.0 + # via msrest +itsdangerous==1.1.0 + # via flask +jedi==0.18.0 + # via ipython +jinja2==2.11.2 + # via + # ax-platform + # bokeh + # flask + # nbconvert + # notebook +jmespath==0.10.0 + # via + # boto3 + # botocore +joblib==1.0.0 + # via + # optuna + # sacremoses + # scikit-learn + # scikit-optimize +jsonschema==3.2.0 + # via + # -c ../requirements.txt + # nbformat +jupyter-client==6.1.11 + # via + # ipykernel + # jupyter-console + # nbclient + # notebook + # qtconsole +jupyter-console==6.2.0 + # via jupyter +jupyter-core==4.7.0 + # via + # jupyter-client + # nbconvert + # nbformat + # notebook + # qtconsole +jupyter==1.0.0 + # via -r requirements_tune.in +jupyterlab-pygments==0.1.2 + # via nbconvert +jupyterlab-widgets==1.0.0 + # via ipywidgets +keras==2.4.3 + # via -r requirements_tune.in +kiwisolver==1.3.1 + # via matplotlib +kubernetes==12.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +lightgbm==3.1.1 + # via -r requirements_tune.in +locket==0.2.1 + # via partd +mako==1.1.4 + # via alembic +markdown==3.3.3 + # via tensorboard +markupsafe==1.1.1 + # via + # jinja2 + # mako +matplotlib==3.3.3 + # via + # -r requirements_tune.in + # autogluon.core + # gluoncv + # zoopt +mistune==0.8.4 + # via nbconvert +mlflow==1.13.1 + # via -r requirements_tune.in +more-itertools==8.6.0 + # via pytest +msgpack==1.0.2 + # via + # -c ../requirements.txt + # distributed +msrest==0.6.19 + # via azure-storage-blob +mxnet==1.7.0.post1 + # via -r requirements_tune.in +nbclient==0.5.1 + # via nbconvert +nbconvert==6.0.7 + # via + # jupyter + # notebook +nbformat==5.1.2 + # via + # ipywidgets + # nbclient + # nbconvert + # notebook +nest-asyncio==1.4.3 + # via nbclient +netifaces==0.10.9 + # via hpbandster +networkx==2.5 + # via + # -c ../requirements.txt + # hyperopt +nevergrad==0.4.2.post5 + # via -r requirements_tune.in +notebook==6.2.0 + # via + # jupyter + # widgetsnbextension +numpy==1.19.5 + # via + # -c 
../requirements.txt + # atari-py + # autogluon.core + # autograd + # bayesian-optimization + # bokeh + # cma + # cmaes + # configspace + # dask + # decord + # dragonfly-opt + # gluoncv + # gpy + # gym + # h5py + # hpbandster + # hyperopt + # keras + # lightgbm + # matplotlib + # mlflow + # mxnet + # nevergrad + # opencv-python + # optuna + # pandas + # paramz + # patsy + # pytorch-lightning + # scikit-learn + # scikit-optimize + # scipy + # statsmodels + # tensorboard + # tensorboardx + # tensorflow-probability + # torch + # torchvision + # transformers + # xgboost + # zoopt +oauthlib==3.1.0 + # via requests-oauthlib +opencv-python==4.5.1.48 + # via + # gluoncv + # gym +optuna==2.4.0 + # via -r requirements_tune.in +packaging==20.8 + # via + # bleach + # bokeh + # optuna + # pytest + # transformers +pandas==1.0.5 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # dask + # gluoncv + # mlflow + # statsmodels +pandocfilters==1.4.3 + # via nbconvert +paramiko==2.7.2 + # via autogluon.core +paramz==0.9.5 + # via gpy +parso==0.8.1 + # via jedi +partd==1.1.0 + # via dask +patsy==0.5.1 + # via statsmodels +pbr==5.5.1 + # via + # cliff + # stevedore +pexpect==4.8.0 + # via + # -c ../requirements.txt + # ipython +pickleshare==0.7.5 + # via ipython +pillow==7.2.0 ; platform_system != "Windows" + # via + # -c ../requirements.txt + # bokeh + # gluoncv + # gym + # matplotlib + # torchvision +plotly==4.14.3 + # via ax-platform +pluggy==0.13.1 + # via pytest +portalocker==2.0.0 + # via gluoncv +prettytable==0.7.2 + # via cliff +prometheus-client==0.9.0 + # via + # -c ../requirements.txt + # notebook + # prometheus-flask-exporter +prometheus-flask-exporter==0.18.1 + # via mlflow +promise==2.3 + # via wandb +prompt-toolkit==3.0.13 + # via + # ipython + # jupyter-console +protobuf==3.14.0 + # via + # -c ../requirements.txt + # mlflow + # tensorboard + # tensorboardx + # wandb +psutil==5.8.0 + # via + # distributed + # wandb +ptyprocess==0.7.0 + # via + # 
pexpect + # terminado +py==1.10.0 + # via pytest +pyaml==20.4.0 + # via scikit-optimize +pyasn1-modules==0.2.8 + # via google-auth +pyasn1==0.4.8 + # via + # pyasn1-modules + # rsa +pycparser==2.20 + # via cffi +pyglet==1.5.0 + # via gym +pygments==2.7.4 + # via + # -c ../requirements.txt + # ipython + # jupyter-console + # jupyterlab-pygments + # nbconvert + # qtconsole +pynacl==1.4.0 + # via paramiko +pyparsing==2.4.7 + # via + # cliff + # configspace + # matplotlib + # packaging +pyperclip==1.8.1 + # via cmd2 +pyro4==4.80 + # via hpbandster +pyrsistent==0.17.3 + # via jsonschema +pytest-remotedata==0.3.2 + # via -r requirements_tune.in +pytest==5.4.3 + # via + # -c ../requirements.txt + # autogluon.core + # pytest-remotedata +python-dateutil==2.8.1 + # via + # alembic + # bokeh + # botocore + # jupyter-client + # kubernetes + # matplotlib + # mlflow + # pandas + # wandb +python-editor==1.0.4 + # via alembic +pytorch-lightning-bolts==0.2.5 + # via -r requirements_tune.in +pytorch-lightning==1.0.3 + # via + # -r requirements_tune.in + # pytorch-lightning-bolts +pytz==2020.5 + # via pandas +pyyaml==5.4.1 + # via + # -c ../requirements.txt + # autocfg + # bokeh + # cliff + # dask + # distributed + # gluoncv + # keras + # kubernetes + # mlflow + # pyaml + # pytorch-lightning + # wandb + # yacs +pyzmq==21.0.1 + # via + # jupyter-client + # notebook + # qtconsole +qtconsole==5.0.2 + # via jupyter +qtpy==1.9.0 + # via qtconsole +querystring-parser==1.2.4 + # via mlflow +regex==2020.11.13 + # via + # sacremoses + # transformers +requests-oauthlib==1.3.0 + # via + # google-auth-oauthlib + # kubernetes + # msrest +requests==2.25.1 + # via + # -c ../requirements.txt + # autogluon.core + # azure-core + # databricks-cli + # docker + # gluoncv + # kubernetes + # mlflow + # msrest + # mxnet + # requests-oauthlib + # sigopt + # tensorboard + # transformers + # wandb +retrying==1.3.3 + # via plotly +rsa==4.7 + # via google-auth +s3transfer==0.3.4 + # via boto3 +sacremoses==0.0.43 
+ # via transformers +scikit-learn==0.22.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in + # autogluon.core + # ax-platform + # bayesian-optimization + # gpytorch + # lightgbm + # scikit-optimize +scikit-optimize==0.8.1 + # via + # -r requirements_tune.in + # autogluon.core +scipy==1.4.1 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # bayesian-optimization + # botorch + # dragonfly-opt + # gluoncv + # gpy + # gpytorch + # gym + # hpbandster + # hyperopt + # keras + # lightgbm + # optuna + # paramz + # scikit-learn + # scikit-optimize + # statsmodels + # xgboost +send2trash==1.5.0 + # via notebook +sentencepiece==0.1.95 + # via transformers +sentry-sdk==0.19.5 + # via wandb +serpent==1.30.2 + # via + # hpbandster + # pyro4 +shortuuid==1.0.1 + # via wandb +sigopt==5.7.0 + # via -r requirements_tune.in +six==1.15.0 + # via + # absl-py + # argon2-cffi + # atari-py + # azure-core + # bcrypt + # bleach + # cryptography + # cycler + # databricks-cli + # dm-tree + # docker + # docker-pycreds + # dragonfly-opt + # google-auth + # gpy + # grpcio + # hyperopt + # isodate + # jsonschema + # kubernetes + # mlflow + # paramz + # patsy + # plotly + # promise + # protobuf + # pynacl + # pytest-remotedata + # python-dateutil + # querystring-parser + # retrying + # sacremoses + # tensorboard + # tensorboardx + # tensorflow-probability + # wandb + # websocket-client +smart_open[s3]==4.1.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in +smmap==3.0.4 + # via gitdb +sortedcontainers==2.3.0 + # via distributed +sqlalchemy==1.3.22 + # via + # alembic + # mlflow + # optuna +sqlparse==0.4.1 + # via mlflow +statsmodels==0.12.1 + # via hpbandster +stevedore==3.3.0 + # via cliff +subprocess32==3.5.4 + # via wandb +tabulate==0.8.7 + # via + # -c ../requirements.txt + # databricks-cli +tblib==1.7.0 + # via distributed +tensorboard-plugin-wit==1.8.0 + # via tensorboard +tensorboard==2.4.1 + # via pytorch-lightning +tensorboardx==2.1 + # 
via + # -c ../requirements.txt + # gluoncv +tensorflow-probability==0.11.1 + # via -r requirements_tune.in +terminado==0.9.2 + # via notebook +testpath==0.4.4 + # via nbconvert +timm==0.3.2 + # via -r requirements_tune.in +tokenizers==0.8.1.rc2 + # via transformers +toolz==0.11.1 + # via + # dask + # distributed + # partd +torch==1.7.0+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # botorch + # gpytorch + # pytorch-lightning + # pytorch-lightning-bolts + # timm + # torchvision +torchvision==0.8.1+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # timm +tornado==6.1 + # via + # autogluon.core + # bokeh + # distributed + # ipykernel + # jupyter-client + # notebook + # terminado +tqdm==4.56.0 + # via + # autogluon.core + # gluoncv + # hyperopt + # optuna + # pytorch-lightning + # sacremoses + # transformers +traitlets==5.0.5 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # nbclient + # nbconvert + # nbformat + # notebook + # qtconsole +transformers==3.1 + # via -r requirements_tune.in +typeguard==2.10.0 + # via ax-platform +typing-extensions==3.7.4.3 + # via + # bokeh + # importlib-metadata + # nevergrad + # torch +typing==3.7.4.3 + # via configspace +urllib3==1.26.2 + # via + # botocore + # kubernetes + # requests + # sentry-sdk +wandb==0.10.12 + # via -r requirements_tune.in +watchdog==1.0.2 + # via wandb +wcwidth==0.2.5 + # via + # cmd2 + # prompt-toolkit + # pytest +webencodings==0.5.1 + # via bleach +websocket-client==0.57.0 + # via + # docker + # kubernetes +werkzeug==1.0.1 + # via + # -c ../requirements.txt + # flask + # tensorboard +wheel==0.36.2 + # via + # lightgbm + # tensorboard +widgetsnbextension==3.5.1 + # via ipywidgets +xgboost==1.3.0.post0 + # via -r requirements_tune.in +yacs==0.1.8 + # via gluoncv +zict==2.0.0 + # via distributed +zipp==3.4.0 + # via importlib-metadata +zoopt==0.4.1 + # via -r requirements_tune.in + +# The following packages are considered to be 
unsafe in a requirements file: +# setuptools diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt new file mode 100644 index 000000000000..14aade6549ee --- /dev/null +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -0,0 +1,864 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile requirements_tune.in +# +--find-links https://download.pytorch.org/whl/torch_stable.html + +absl-py==0.11.0 + # via tensorboard +alembic==1.4.1 + # via + # mlflow + # optuna +argon2-cffi==20.1.0 + # via notebook +async-generator==1.10 + # via nbclient +atari-py==0.2.6 + # via + # -c ../requirements.txt + # gym +attrs==20.3.0 + # via + # cmd2 + # jsonschema + # pytest +autocfg==0.0.6 + # via gluoncv +autogluon.core==0.0.16b20210125 + # via gluoncv +autograd==1.3 + # via autogluon.core +ax-platform==0.1.19 ; python_version >= "3.7" + # via -r requirements_tune.in +azure-core==1.10.0 + # via azure-storage-blob +azure-storage-blob==12.7.1 + # via mlflow +backcall==0.2.0 + # via ipython +bayesian-optimization==1.2.0 + # via + # -r requirements_tune.in + # nevergrad +bcrypt==3.2.0 + # via paramiko +bleach==3.2.2 + # via nbconvert +bokeh==2.2.3 + # via dask +boto3==1.16.58 + # via + # -c ../requirements.txt + # autogluon.core + # smart-open +botocore==1.19.58 + # via + # boto3 + # s3transfer +botorch==0.3.3 + # via ax-platform +cachetools==4.2.0 + # via google-auth +certifi==2020.12.5 + # via + # kubernetes + # msrest + # requests + # sentry-sdk +cffi==1.14.4 + # via + # argon2-cffi + # bcrypt + # cryptography + # pynacl +chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -c ../requirements.txt + # databricks-cli + # distributed + # flask + # mlflow + # sacremoses + # wandb +cliff==3.6.0 + # via optuna +cloudpickle==1.6.0 + # via + # dask + # distributed + # gym + # hyperopt + # mlflow + # tensorflow-probability +cma==3.0.3 + # via nevergrad +cmaes==0.7.0 + # via optuna 
+cmd2==1.4.0 + # via cliff +colorama==0.4.4 + # via + # -c ../requirements.txt + # cmd2 +colorlog==4.7.2 + # via optuna +configparser==5.0.1 + # via wandb +configspace==0.4.10 + # via + # -r requirements_tune.in + # autogluon.core + # hpbandster +cryptography==3.3.1 + # via + # azure-storage-blob + # paramiko +cycler==0.10.0 + # via matplotlib +cython==0.29.0 + # via + # -c ../requirements.txt + # autogluon.core + # configspace +dask[complete]==2021.1.0 + # via + # -c ../requirements.txt + # autogluon.core + # distributed +databricks-cli==0.14.1 + # via mlflow +dataclasses==0.6 + # via torch +decorator==4.4.2 + # via + # ipython + # networkx + # paramz + # tensorflow-probability +decord==0.4.2 + # via gluoncv +defusedxml==0.6.0 + # via nbconvert +dill==0.3.3 + # via autogluon.core +distributed==2021.1.1 + # via + # autogluon.core + # dask +dm-tree==0.1.5 + # via + # -c ../requirements.txt + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb +docker==4.4.1 + # via mlflow +dragonfly-opt==0.1.6 + # via -r requirements_tune.in +entrypoints==0.3 + # via + # mlflow + # nbconvert +filelock==3.0.12 + # via + # -c ../requirements.txt + # transformers +flask==1.1.2 + # via + # -c ../requirements.txt + # mlflow + # prometheus-flask-exporter +fsspec==0.8.5 + # via + # dask + # pytorch-lightning +future==0.18.2 + # via + # autograd + # dragonfly-opt + # hyperopt + # pyglet + # pytorch-lightning + # torch +gast==0.4.0 + # via tensorflow-probability +gitdb==4.0.5 + # via gitpython +gitpython==3.1.12 + # via + # mlflow + # wandb +gluoncv==0.9.1 + # via -r requirements_tune.in +google-auth-oauthlib==0.4.2 + # via tensorboard +google-auth==1.24.0 + # via + # google-auth-oauthlib + # kubernetes + # tensorboard +gpy==1.9.9 + # via -r requirements_tune.in +gpytorch==1.3.1 + # via botorch +graphviz==0.8.4 + # via + # autogluon.core + # mxnet +grpcio==1.35.0 + # via + # -c ../requirements.txt + # tensorboard +gunicorn==20.0.4 + # via mlflow +gym==0.18.0 + # via + # -c 
../requirements.txt + # -r requirements_tune.in +h5py==3.1.0 + # via + # -r requirements_tune.in + # keras +heapdict==1.0.1 + # via zict +hpbandster==0.7.4 + # via -r requirements_tune.in +hyperopt==0.2.5 + # via -r requirements_tune.in +idna==2.10 + # via requests +ipykernel==5.4.3 + # via + # ipywidgets + # jupyter + # jupyter-console + # notebook + # qtconsole +ipython-genutils==0.2.0 + # via + # nbformat + # notebook + # qtconsole + # traitlets +ipython==7.19.0 + # via + # ipykernel + # ipywidgets + # jupyter-console +ipywidgets==7.6.3 + # via jupyter +isodate==0.6.0 + # via msrest +itsdangerous==1.1.0 + # via flask +jedi==0.18.0 + # via ipython +jinja2==2.11.2 + # via + # ax-platform + # bokeh + # flask + # nbconvert + # notebook +jmespath==0.10.0 + # via + # boto3 + # botocore +joblib==1.0.0 + # via + # optuna + # sacremoses + # scikit-learn + # scikit-optimize +jsonschema==3.2.0 + # via + # -c ../requirements.txt + # nbformat +jupyter-client==6.1.11 + # via + # ipykernel + # jupyter-console + # nbclient + # notebook + # qtconsole +jupyter-console==6.2.0 + # via jupyter +jupyter-core==4.7.0 + # via + # jupyter-client + # nbconvert + # nbformat + # notebook + # qtconsole +jupyter==1.0.0 + # via -r requirements_tune.in +jupyterlab-pygments==0.1.2 + # via nbconvert +jupyterlab-widgets==1.0.0 + # via ipywidgets +keras==2.4.3 + # via -r requirements_tune.in +kiwisolver==1.3.1 + # via matplotlib +kubernetes==12.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +lightgbm==3.1.1 + # via -r requirements_tune.in +locket==0.2.1 + # via partd +mako==1.1.4 + # via alembic +markdown==3.3.3 + # via tensorboard +markupsafe==1.1.1 + # via + # jinja2 + # mako +matplotlib==3.3.3 + # via + # -r requirements_tune.in + # autogluon.core + # gluoncv + # zoopt +mistune==0.8.4 + # via nbconvert +mlflow==1.13.1 + # via -r requirements_tune.in +more-itertools==8.6.0 + # via pytest +msgpack==1.0.2 + # via + # -c ../requirements.txt + # distributed +msrest==0.6.19 + # via 
azure-storage-blob +mxnet==1.7.0.post1 + # via -r requirements_tune.in +nbclient==0.5.1 + # via nbconvert +nbconvert==6.0.7 + # via + # jupyter + # notebook +nbformat==5.1.2 + # via + # ipywidgets + # nbclient + # nbconvert + # notebook +nest-asyncio==1.4.3 + # via nbclient +netifaces==0.10.9 + # via hpbandster +networkx==2.5 + # via + # -c ../requirements.txt + # hyperopt +nevergrad==0.4.2.post5 + # via -r requirements_tune.in +notebook==6.2.0 + # via + # jupyter + # widgetsnbextension +numpy==1.19.5 + # via + # -c ../requirements.txt + # atari-py + # autogluon.core + # autograd + # bayesian-optimization + # bokeh + # cma + # cmaes + # configspace + # dask + # decord + # dragonfly-opt + # gluoncv + # gpy + # gym + # h5py + # hpbandster + # hyperopt + # keras + # lightgbm + # matplotlib + # mlflow + # mxnet + # nevergrad + # opencv-python + # optuna + # pandas + # paramz + # patsy + # pytorch-lightning + # scikit-learn + # scikit-optimize + # scipy + # statsmodels + # tensorboard + # tensorboardx + # tensorflow-probability + # torch + # torchvision + # transformers + # xgboost + # zoopt +oauthlib==3.1.0 + # via requests-oauthlib +opencv-python==4.5.1.48 + # via + # gluoncv + # gym +optuna==2.4.0 + # via -r requirements_tune.in +packaging==20.8 + # via + # bleach + # bokeh + # optuna + # pytest + # transformers +pandas==1.0.5 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # dask + # gluoncv + # mlflow + # statsmodels +pandocfilters==1.4.3 + # via nbconvert +paramiko==2.7.2 + # via autogluon.core +paramz==0.9.5 + # via gpy +parso==0.8.1 + # via jedi +partd==1.1.0 + # via dask +patsy==0.5.1 + # via statsmodels +pbr==5.5.1 + # via + # cliff + # stevedore +pexpect==4.8.0 + # via + # -c ../requirements.txt + # ipython +pickleshare==0.7.5 + # via ipython +pillow==7.2.0 ; platform_system != "Windows" + # via + # -c ../requirements.txt + # bokeh + # gluoncv + # gym + # matplotlib + # torchvision +plotly==4.14.3 + # via ax-platform +pluggy==0.13.1 + 
# via pytest +portalocker==2.0.0 + # via gluoncv +prettytable==0.7.2 + # via cliff +prometheus-client==0.9.0 + # via + # -c ../requirements.txt + # notebook + # prometheus-flask-exporter +prometheus-flask-exporter==0.18.1 + # via mlflow +promise==2.3 + # via wandb +prompt-toolkit==3.0.13 + # via + # ipython + # jupyter-console +protobuf==3.14.0 + # via + # -c ../requirements.txt + # mlflow + # tensorboard + # tensorboardx + # wandb +psutil==5.8.0 + # via + # distributed + # wandb +ptyprocess==0.7.0 + # via + # pexpect + # terminado +py==1.10.0 + # via pytest +pyaml==20.4.0 + # via scikit-optimize +pyasn1-modules==0.2.8 + # via google-auth +pyasn1==0.4.8 + # via + # pyasn1-modules + # rsa +pycparser==2.20 + # via cffi +pyglet==1.5.0 + # via gym +pygments==2.7.4 + # via + # -c ../requirements.txt + # ipython + # jupyter-console + # jupyterlab-pygments + # nbconvert + # qtconsole +pynacl==1.4.0 + # via paramiko +pyparsing==2.4.7 + # via + # cliff + # configspace + # matplotlib + # packaging +pyperclip==1.8.1 + # via cmd2 +pyro4==4.80 + # via hpbandster +pyrsistent==0.17.3 + # via jsonschema +pytest-remotedata==0.3.2 + # via -r requirements_tune.in +pytest==5.4.3 + # via + # -c ../requirements.txt + # autogluon.core + # pytest-remotedata +python-dateutil==2.8.1 + # via + # alembic + # bokeh + # botocore + # jupyter-client + # kubernetes + # matplotlib + # mlflow + # pandas + # wandb +python-editor==1.0.4 + # via alembic +pytorch-lightning-bolts==0.2.5 + # via -r requirements_tune.in +pytorch-lightning==1.0.3 + # via + # -r requirements_tune.in + # pytorch-lightning-bolts +pytz==2020.5 + # via pandas +pyyaml==5.4.1 + # via + # -c ../requirements.txt + # autocfg + # bokeh + # cliff + # dask + # distributed + # gluoncv + # keras + # kubernetes + # mlflow + # pyaml + # pytorch-lightning + # wandb + # yacs +pyzmq==21.0.1 + # via + # jupyter-client + # notebook + # qtconsole +qtconsole==5.0.2 + # via jupyter +qtpy==1.9.0 + # via qtconsole +querystring-parser==1.2.4 + # via 
mlflow +regex==2020.11.13 + # via + # sacremoses + # transformers +requests-oauthlib==1.3.0 + # via + # google-auth-oauthlib + # kubernetes + # msrest +requests==2.25.1 + # via + # -c ../requirements.txt + # autogluon.core + # azure-core + # databricks-cli + # docker + # gluoncv + # kubernetes + # mlflow + # msrest + # mxnet + # requests-oauthlib + # sigopt + # tensorboard + # transformers + # wandb +retrying==1.3.3 + # via plotly +rsa==4.7 + # via google-auth +s3transfer==0.3.4 + # via boto3 +sacremoses==0.0.43 + # via transformers +scikit-learn==0.22.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in + # autogluon.core + # ax-platform + # bayesian-optimization + # gpytorch + # lightgbm + # scikit-optimize +scikit-optimize==0.8.1 + # via + # -r requirements_tune.in + # autogluon.core +scipy==1.4.1 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # bayesian-optimization + # botorch + # dragonfly-opt + # gluoncv + # gpy + # gpytorch + # gym + # hpbandster + # hyperopt + # keras + # lightgbm + # optuna + # paramz + # scikit-learn + # scikit-optimize + # statsmodels + # xgboost +send2trash==1.5.0 + # via notebook +sentencepiece==0.1.95 + # via transformers +sentry-sdk==0.19.5 + # via wandb +serpent==1.30.2 + # via + # hpbandster + # pyro4 +shortuuid==1.0.1 + # via wandb +sigopt==5.7.0 + # via -r requirements_tune.in +six==1.15.0 + # via + # absl-py + # argon2-cffi + # atari-py + # azure-core + # bcrypt + # bleach + # cryptography + # cycler + # databricks-cli + # dm-tree + # docker + # docker-pycreds + # dragonfly-opt + # google-auth + # gpy + # grpcio + # hyperopt + # isodate + # jsonschema + # kubernetes + # mlflow + # paramz + # patsy + # plotly + # promise + # protobuf + # pynacl + # pytest-remotedata + # python-dateutil + # querystring-parser + # retrying + # sacremoses + # tensorboard + # tensorboardx + # tensorflow-probability + # wandb + # websocket-client +smart_open==4.0.1 + # via + # -c ../requirements.txt + # -r 
requirements_tune.in +smmap==3.0.4 + # via gitdb +sortedcontainers==2.3.0 + # via distributed +sqlalchemy==1.3.22 + # via + # alembic + # mlflow + # optuna +sqlparse==0.4.1 + # via mlflow +statsmodels==0.12.1 + # via hpbandster +stevedore==3.3.0 + # via cliff +subprocess32==3.5.4 + # via wandb +tabulate==0.8.7 + # via + # -c ../requirements.txt + # databricks-cli +tblib==1.7.0 + # via distributed +tensorboard-plugin-wit==1.8.0 + # via tensorboard +tensorboard==2.4.1 + # via pytorch-lightning +tensorboardx==2.1 + # via + # -c ../requirements.txt + # gluoncv +tensorflow-probability==0.11.1 + # via -r requirements_tune.in +terminado==0.9.2 + # via notebook +testpath==0.4.4 + # via nbconvert +timm==0.3.2 + # via -r requirements_tune.in +tokenizers==0.8.1.rc2 + # via transformers +toolz==0.11.1 + # via + # dask + # distributed + # partd +torch==1.7.0+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # botorch + # gpytorch + # pytorch-lightning + # pytorch-lightning-bolts + # timm + # torchvision +torchvision==0.8.1+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # timm +tornado==6.1 + # via + # autogluon.core + # bokeh + # distributed + # ipykernel + # jupyter-client + # notebook + # terminado +tqdm==4.56.0 + # via + # autogluon.core + # gluoncv + # hyperopt + # optuna + # pytorch-lightning + # sacremoses + # transformers +traitlets==5.0.5 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # nbclient + # nbconvert + # nbformat + # notebook + # qtconsole +transformers==3.1 + # via -r requirements_tune.in +typeguard==2.10.0 + # via ax-platform +typing-extensions==3.7.4.3 + # via + # bokeh + # nevergrad + # torch +typing==3.7.4.3 + # via configspace +urllib3==1.26.2 + # via + # botocore + # kubernetes + # requests + # sentry-sdk +wandb==0.10.12 + # via -r requirements_tune.in +watchdog==1.0.2 + # via wandb +wcwidth==0.2.5 + # via + # cmd2 + # prompt-toolkit + # pytest +webencodings==0.5.1 + # via 
bleach +websocket-client==0.57.0 + # via + # docker + # kubernetes +werkzeug==1.0.1 + # via + # -c ../requirements.txt + # flask + # tensorboard +wheel==0.36.2 + # via + # lightgbm + # tensorboard +widgetsnbextension==3.5.1 + # via ipywidgets +xgboost==1.3.0.post0 + # via -r requirements_tune.in +yacs==0.1.8 + # via gluoncv +zict==2.0.0 + # via distributed +zoopt==0.4.1 + # via -r requirements_tune.in + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/python/requirements.txt b/python/requirements/requirements.txt similarity index 100% rename from python/requirements.txt rename to python/requirements/requirements.txt diff --git a/python/requirements/requirements_tune.txt b/python/requirements/requirements_tune.in similarity index 92% rename from python/requirements/requirements_tune.txt rename to python/requirements/requirements_tune.in index 5ee1b9026f9e..96a263204e97 100644 --- a/python/requirements/requirements_tune.txt +++ b/python/requirements/requirements_tune.in @@ -1,3 +1,6 @@ +# Use base requirements to constrain these requirements. +-c ./requirements.txt + ax-platform==0.1.9; python_version < '3.7' ax-platform==0.1.19; python_version >= '3.7' bayesian-optimization==1.2.0 From 416a093c152e74aa5604fbb5efda8663a3a05851 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 027/244] Revert "Fix broken link to Flow docs (#14058)" This reverts commit c216bca63c2d98020ab9dd2bdc5e5f79d48fd1bf. --- doc/source/rllib-examples.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst index 0f70a536a4b4..9764644a0c46 100644 --- a/doc/source/rllib-examples.rst +++ b/doc/source/rllib-examples.rst @@ -123,5 +123,5 @@ Community Examples Example of using the multi-agent API to model several `social dilemma games `__. 
- `StarCraft2 `__: Example of training in StarCraft2 maps with RLlib / multi-agent. -- `Traffic Flow `__: +- `Traffic Flow `__: Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. From b4bf7f53d3d5d96bd2a5b989a5bd1fbbbebb4771 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 028/244] Revert "Fix the wrong spark on ray link. (#14057)" This reverts commit a43aee69d570606c2062fe134cfb6f9f05352f79. --- doc/source/raydp.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/raydp.rst b/doc/source/raydp.rst index a0ee98282895..9a8353ccc9f1 100644 --- a/doc/source/raydp.rst +++ b/doc/source/raydp.rst @@ -7,7 +7,7 @@ data processing using the PySpark API and seemlessly use that data to train your models using TensorFlow and PyTorch. For more information and examples, see the RayDP Github page: -https://github.com/oap-project/raydp +https://github.com/oap_project/raydp ================ Installing RayDP From e1f3357ca2e13b06cca960c00b0ae39b9c807ec0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 029/244] Revert "[Core] Ownership-based Object Directory - Added support for object spilling in the ownership-based object directory. (#13948)" This reverts commit e6b200b1d656d299836a83fa0b4159955ce574dc. 
--- python/ray/_raylet.pxd | 3 +- python/ray/_raylet.pyx | 27 +- python/ray/external_storage.py | 61 ++--- python/ray/includes/libcoreworker.pxd | 4 +- python/ray/tests/test_object_spilling.py | 3 +- src/ray/core_worker/core_worker.cc | 44 +--- src/ray/core_worker/core_worker.h | 9 +- src/ray/core_worker/reference_count.cc | 32 +-- src/ray/core_worker/reference_count.h | 21 +- .../ownership_based_object_directory.cc | 180 +++++--------- src/ray/protobuf/core_worker.proto | 32 +-- src/ray/protobuf/node_manager.proto | 4 - src/ray/raylet/local_object_manager.cc | 138 ++++------- src/ray/raylet/local_object_manager.h | 18 +- src/ray/raylet/node_manager.cc | 15 +- .../raylet/test/local_object_manager_test.cc | 230 +++++------------- src/ray/rpc/worker/core_worker_client.h | 5 - src/ray/rpc/worker/core_worker_server.h | 2 - 18 files changed, 212 insertions(+), 616 deletions(-) diff --git a/python/ray/_raylet.pxd b/python/ray/_raylet.pxd index 4a0f7b923b54..e8edc78a71b1 100644 --- a/python/ray/_raylet.pxd +++ b/python/ray/_raylet.pxd @@ -101,8 +101,7 @@ cdef class CoreWorker: cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, c_vector[CObjectID] contained_ids, - CObjectID *c_object_id, shared_ptr[CBuffer] *data, - owner_address=*) + CObjectID *c_object_id, shared_ptr[CBuffer] *data) cdef store_task_outputs( self, worker, outputs, const c_vector[CObjectID] return_ids, c_vector[shared_ptr[CRayObject]] *returns) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index da00f627345e..47b6aa4f8358 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -628,8 +628,7 @@ cdef void gc_collect() nogil: cdef c_vector[c_string] spill_objects_handler( - const c_vector[CObjectID]& object_ids_to_spill, - const c_vector[c_string]& owner_addresses) nogil: + const c_vector[CObjectID]& object_ids_to_spill) nogil: cdef c_vector[c_string] return_urls with gil: object_refs = VectorToObjectRefs(object_ids_to_spill) @@ 
-637,8 +636,7 @@ cdef c_vector[c_string] spill_objects_handler( with ray.worker._changeproctitle( ray_constants.WORKER_PROCESS_TYPE_SPILL_WORKER, ray_constants.WORKER_PROCESS_TYPE_SPILL_WORKER_IDLE): - urls = external_storage.spill_objects( - object_refs, owner_addresses) + urls = external_storage.spill_objects(object_refs) for url in urls: return_urls.push_back(url) except Exception: @@ -932,11 +930,7 @@ cdef class CoreWorker: cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, c_vector[CObjectID] contained_ids, - CObjectID *c_object_id, shared_ptr[CBuffer] *data, - owner_address=None): - cdef: - CAddress c_owner_address - + CObjectID *c_object_id, shared_ptr[CBuffer] *data): if object_ref is None: with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().CreateOwned( @@ -944,16 +938,11 @@ cdef class CoreWorker: c_object_id, data)) else: c_object_id[0] = object_ref.native() - if owner_address is None: - c_owner_address = CCoreWorkerProcess.GetCoreWorker( - ).GetRpcAddress() - else: - c_owner_address = CAddress() - c_owner_address.ParseFromString(owner_address) with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().CreateExisting( metadata, data_size, c_object_id[0], - c_owner_address, data)) + CCoreWorkerProcess.GetCoreWorker().GetRpcAddress(), + data)) # If data is nullptr, that means the ObjectRef already existed, # which we ignore. @@ -962,8 +951,7 @@ cdef class CoreWorker: return data.get() == NULL def put_file_like_object( - self, metadata, data_size, file_like, ObjectRef object_ref, - owner_address): + self, metadata, data_size, file_like, ObjectRef object_ref): """Directly create a new Plasma Store object from a file like object. This avoids extra memory copy. @@ -973,7 +961,6 @@ cdef class CoreWorker: file_like: A python file object that provides the `readinto` interface. object_ref: The new ObjectRef. - owner_address: Owner address for this object ref. 
""" cdef: CObjectID c_object_id @@ -988,7 +975,7 @@ cdef class CoreWorker: object_already_exists = self._create_put_buffer( metadata_buf, data_size, object_ref, ObjectRefsToVector([]), - &c_object_id, &data_buf, owner_address) + &c_object_id, &data_buf) if object_already_exists: logger.debug("Object already exists in 'put_file_like_object'.") return diff --git a/python/ray/external_storage.py b/python/ray/external_storage.py index 138561f432e2..26d5c4a4dbd9 100644 --- a/python/ray/external_storage.py +++ b/python/ray/external_storage.py @@ -80,8 +80,6 @@ class ExternalStorage(metaclass=abc.ABCMeta): the external storage is invalid. """ - HEADER_LENGTH = 24 - def _get_objects_from_store(self, object_refs): worker = ray.worker.global_worker # Since the object should always exist in the plasma store before @@ -91,21 +89,18 @@ def _get_objects_from_store(self, object_refs): ray_object_pairs = worker.core_worker.get_if_local(object_refs) return ray_object_pairs - def _put_object_to_store(self, metadata, data_size, file_like, object_ref, - owner_address): + def _put_object_to_store(self, metadata, data_size, file_like, object_ref): worker = ray.worker.global_worker worker.core_worker.put_file_like_object(metadata, data_size, file_like, - object_ref, owner_address) + object_ref) def _write_multiple_objects(self, f: IO, object_refs: List[ObjectRef], - owner_addresses: List[str], url: str) -> List[str]: """Fuse all given objects into a given file handle. Args: f(IO): File handle to fusion all given object refs. object_refs(list): Object references to fusion to a single file. - owner_addresses(list): Owner addresses for the provided objects. url(str): url where the object ref is stored in the external storage. 
@@ -117,18 +112,13 @@ def _write_multiple_objects(self, f: IO, object_refs: List[ObjectRef], keys = [] offset = 0 ray_object_pairs = self._get_objects_from_store(object_refs) - for ref, (buf, metadata), owner_address in zip( - object_refs, ray_object_pairs, owner_addresses): - address_len = len(owner_address) + for ref, (buf, metadata) in zip(object_refs, ray_object_pairs): metadata_len = len(metadata) buf_len = len(buf) - # 24 bytes to store owner address, metadata, and buffer lengths. - data_size_in_bytes = ( - address_len + metadata_len + buf_len + self.HEADER_LENGTH) - f.write(address_len.to_bytes(8, byteorder="little")) + # 16 bytes to store metadata and buffer length. + data_size_in_bytes = metadata_len + buf_len + 16 f.write(metadata_len.to_bytes(8, byteorder="little")) f.write(buf_len.to_bytes(8, byteorder="little")) - f.write(owner_address) f.write(metadata) f.write(memoryview(buf)) url_with_offset = create_url_with_offset( @@ -137,8 +127,7 @@ def _write_multiple_objects(self, f: IO, object_refs: List[ObjectRef], offset += data_size_in_bytes return keys - def _size_check(self, address_len, metadata_len, buffer_len, - obtained_data_size): + def _size_check(self, metadata_len, buffer_len, obtained_data_size): """Check whether or not the obtained_data_size is as expected. Args: @@ -149,11 +138,9 @@ def _size_check(self, address_len, metadata_len, buffer_len, Raises: ValueError if obtained_data_size is different from - address_len + metadata_len + buffer_len + - 24 (first 8 bytes to store length). + metadata_len + buffer_len + 16(first 8 bytes to store length). 
""" - data_size_in_bytes = ( - address_len + metadata_len + buffer_len + self.HEADER_LENGTH) + data_size_in_bytes = metadata_len + buffer_len + 16 if data_size_in_bytes != obtained_data_size: raise ValueError( f"Obtained data has a size of {data_size_in_bytes}, " @@ -161,7 +148,7 @@ def _size_check(self, address_len, metadata_len, buffer_len, f"size of {obtained_data_size}.") @abc.abstractmethod - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: """Spill objects to the external storage. Objects are specified by their object refs. @@ -204,7 +191,7 @@ def destroy_external_storage(self): class NullStorage(ExternalStorage): """The class that represents an uninitialized external storage.""" - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: raise NotImplementedError("External storage is not initialized") def restore_spilled_objects(self, object_refs, url_with_offset_list): @@ -233,7 +220,7 @@ def __init__(self, directory_path): raise ValueError("The given directory path to store objects, " f"{self.directory_path}, could not be created.") - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: if len(object_refs) == 0: return [] # Always use the first object ref as a key when fusioning objects. 
@@ -241,8 +228,7 @@ def spill_objects(self, object_refs, owner_addresses) -> List[str]: filename = f"{first_ref.hex()}-multi-{len(object_refs)}" url = f"{os.path.join(self.directory_path, filename)}" with open(url, "wb") as f: - return self._write_multiple_objects(f, object_refs, - owner_addresses, url) + return self._write_multiple_objects(f, object_refs, url) def restore_spilled_objects(self, object_refs: List[ObjectRef], url_with_offset_list: List[str]): @@ -257,17 +243,13 @@ def restore_spilled_objects(self, object_refs: List[ObjectRef], # Read a part of the file and recover the object. with open(base_url, "rb") as f: f.seek(offset) - address_len = int.from_bytes(f.read(8), byteorder="little") metadata_len = int.from_bytes(f.read(8), byteorder="little") buf_len = int.from_bytes(f.read(8), byteorder="little") - self._size_check(address_len, metadata_len, buf_len, - parsed_result.size) + self._size_check(metadata_len, buf_len, parsed_result.size) total += buf_len - owner_address = f.read(address_len) metadata = f.read(metadata_len) # read remaining data to our buffer - self._put_object_to_store(metadata, buf_len, f, object_ref, - owner_address) + self._put_object_to_store(metadata, buf_len, f, object_ref) return total def delete_spilled_objects(self, urls: List[str]): @@ -338,7 +320,7 @@ def __init__(self, self.transport_params = {"defer_seek": True} self.transport_params.update(self.override_transport_params) - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: if len(object_refs) == 0: return [] from smart_open import open @@ -349,8 +331,7 @@ def spill_objects(self, object_refs, owner_addresses) -> List[str]: with open( url, "wb", transport_params=self.transport_params) as file_like: - return self._write_multiple_objects(file_like, object_refs, - owner_addresses, url) + return self._write_multiple_objects(file_like, object_refs, url) def restore_spilled_objects(self, object_refs: 
List[ObjectRef], url_with_offset_list: List[str]): @@ -371,16 +352,13 @@ def restore_spilled_objects(self, object_refs: List[ObjectRef], # smart open seek reads the file from offset-end_of_the_file # when the seek is called. f.seek(offset) - address_len = int.from_bytes(f.read(8), byteorder="little") metadata_len = int.from_bytes(f.read(8), byteorder="little") buf_len = int.from_bytes(f.read(8), byteorder="little") self._size_check(metadata_len, buf_len, parsed_result.size) - owner_address = f.read(address_len) total += buf_len metadata = f.read(metadata_len) # read remaining data to our buffer - self._put_object_to_store(metadata, buf_len, f, object_ref, - owner_address) + self._put_object_to_store(metadata, buf_len, f, object_ref) return total def delete_spilled_objects(self, urls: List[str]): @@ -419,17 +397,16 @@ def reset_external_storage(): _external_storage = NullStorage() -def spill_objects(object_refs, owner_addresses): +def spill_objects(object_refs): """Spill objects to the external storage. Objects are specified by their object refs. Args: object_refs: The list of the refs of the objects to be spilled. - owner_addresses: The owner addresses of the provided object refs. Returns: A list of keys corresponding to the input object refs. 
""" - return _external_storage.spill_objects(object_refs, owner_addresses) + return _external_storage.spill_objects(object_refs) def restore_spilled_objects(object_refs: List[ObjectRef], diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 6114b9e7d58c..0b7c3b0f537f 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -241,9 +241,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: (void(const CWorkerID &) nogil) on_worker_shutdown (CRayStatus() nogil) check_signals (void() nogil) gc_collect - (c_vector[c_string]( - const c_vector[CObjectID] &, - const c_vector[c_string] &) nogil) spill_objects + (c_vector[c_string](const c_vector[CObjectID] &) nogil) spill_objects (int64_t( const c_vector[CObjectID] &, const c_vector[c_string] &) nogil) restore_spilled_objects diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index e0e3033d255a..500c662250ac 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -564,8 +564,7 @@ def wait_until_actor_dead(): @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], - reason="Failing on Windows and MacOS.") + platform.system() == "Windows", reason="Failing on Windows.") def test_delete_objects_multi_node(multi_node_object_spilling_config, ray_start_cluster): # Limit our object store to 75 MiB of memory. diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 86f6344b53dc..73b8b89815f2 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1271,8 +1271,6 @@ void CoreWorker::SpillOwnedObject(const ObjectID &object_id, RAY_LOG(ERROR) << "Failed to spill object " << object_id << ", raylet unreachable or object could not be spilled."; } - // TODO(Clark): Provide spilled URL and spilled node ID to callback so it can - // added them to the reference. 
callback(); }); } @@ -1283,7 +1281,6 @@ Status CoreWorker::SpillObjects(const std::vector &object_ids) { auto ready_promise = std::make_shared>(std::promise()); Status final_status; - // TODO(Clark): Add spilled URL and spilled node ID to reference in this callback. auto callback = [mutex, num_remaining, ready_promise]() { absl::MutexLock lock(mutex.get()); (*num_remaining)--; @@ -1323,10 +1320,7 @@ Status CoreWorker::SpillObjects(const std::vector &object_ids) { ready_promise->get_future().wait(); for (const auto &object_id : object_ids) { - // TODO(Clark): Move this to the callback (unless we really wanted to batch it) and - // also include the spilled URL, spilled node ID, and updated object size. - reference_counter_->HandleObjectSpilled(object_id, "", NodeID::Nil(), -1, - /*release*/ true); + reference_counter_->HandleObjectSpilled(object_id); } return final_status; } @@ -2237,19 +2231,15 @@ void CoreWorker::HandleGetObjectLocationsOwner( auto object_id = ObjectID::FromBinary(request.object_id()); const auto &callback = [object_id, reply, send_reply_callback]( const absl::flat_hash_set &locations, - int64_t object_size, const std::string &spilled_url, - const NodeID &spilled_node_id, int64_t current_version) { + int64_t object_size, int64_t current_version) { RAY_LOG(DEBUG) << "Replying to HandleGetObjectLocationsOwner for " << object_id << " with location update version " << current_version << ", " - << locations.size() << " locations, " << spilled_url - << " spilled url, " << spilled_node_id << " spilled node ID, and " - << object_size << " object size."; + << locations.size() << " locations, and " << object_size + << " object size."; for (const auto &node_id : locations) { reply->add_node_ids(node_id.Binary()); } reply->set_object_size(object_size); - reply->set_spilled_url(spilled_url); - reply->set_spilled_node_id(spilled_node_id.Binary()); reply->set_current_version(current_version); send_reply_callback(Status::OK(), nullptr, nullptr); }; @@ -2442,13 
+2432,7 @@ void CoreWorker::HandleSpillObjects(const rpc::SpillObjectsRequest &request, for (const auto &id_binary : request.object_ids_to_spill()) { object_ids_to_spill.push_back(ObjectID::FromBinary(id_binary)); } - std::vector owner_addresses; - owner_addresses.reserve(request.owner_addresses_size()); - for (const auto &owner_address : request.owner_addresses()) { - owner_addresses.push_back(owner_address.SerializeAsString()); - } - std::vector object_urls = - options_.spill_objects(object_ids_to_spill, owner_addresses); + std::vector object_urls = options_.spill_objects(object_ids_to_spill); for (size_t i = 0; i < object_urls.size(); i++) { reply->add_spilled_objects_url(std::move(object_urls[i])); } @@ -2459,24 +2443,6 @@ void CoreWorker::HandleSpillObjects(const rpc::SpillObjectsRequest &request, } } -void CoreWorker::HandleAddSpilledUrl(const rpc::AddSpilledUrlRequest &request, - rpc::AddSpilledUrlReply *reply, - rpc::SendReplyCallback send_reply_callback) { - const ObjectID object_id = ObjectID::FromBinary(request.object_id()); - const std::string &spilled_url = request.spilled_url(); - const NodeID node_id = NodeID::FromBinary(request.spilled_node_id()); - RAY_LOG(DEBUG) << "Received AddSpilledUrl request for object " << object_id - << ", which has been spilled to " << spilled_url << " on node " - << node_id; - auto reference_exists = reference_counter_->HandleObjectSpilled( - object_id, spilled_url, node_id, request.size(), /*release*/ false); - Status status = - reference_exists - ? 
Status::OK() - : Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); - send_reply_callback(status, nullptr, nullptr); -} - void CoreWorker::HandleRestoreSpilledObjects( const rpc::RestoreSpilledObjectsRequest &request, rpc::RestoreSpilledObjectsReply *reply, rpc::SendReplyCallback send_reply_callback) { diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 2ced7a10fdb8..e1632644195d 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -137,9 +137,7 @@ struct CoreWorkerOptions { /// be held up in garbage objects. std::function gc_collect; /// Application-language callback to spill objects to external storage. - std::function(const std::vector &, - const std::vector &)> - spill_objects; + std::function(const std::vector &)> spill_objects; /// Application-language callback to restore objects from external storage. std::function &, const std::vector &)> restore_spilled_objects; @@ -913,11 +911,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { rpc::SpillObjectsReply *reply, rpc::SendReplyCallback send_reply_callback) override; - // Add spilled URL to owned reference. - void HandleAddSpilledUrl(const rpc::AddSpilledUrlRequest &request, - rpc::AddSpilledUrlReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - // Restore objects from external storage. 
void HandleRestoreSpilledObjects(const rpc::RestoreSpilledObjectsRequest &request, rpc::RestoreSpilledObjectsReply *reply, diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index 87400ca21252..db05320a9c8b 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -960,33 +960,17 @@ size_t ReferenceCounter::GetObjectSize(const ObjectID &object_id) const { return it->second.object_size; } -bool ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id, - const std::string spilled_url, - const NodeID &spilled_node_id, int64_t size, - bool release) { +void ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id) { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it == object_id_refs_.end()) { RAY_LOG(WARNING) << "Spilled object " << object_id << " already out of scope"; - return false; + return; } it->second.spilled = true; - if (spilled_url != "") { - it->second.spilled_url = spilled_url; - } - if (!spilled_node_id.IsNil()) { - it->second.spilled_node_id = spilled_node_id; - } - if (size > 0) { - it->second.object_size = size; - } - PushToLocationSubscribers(it); - if (release) { - // Release the primary plasma copy, if any. - ReleasePlasmaObject(it); - } - return true; + // Release the primary plasma copy, if any. 
+ ReleasePlasmaObject(it); } absl::optional ReferenceCounter::GetLocalityData( @@ -1026,9 +1010,8 @@ void ReferenceCounter::PushToLocationSubscribers(ReferenceTable::iterator it) { const auto callbacks = it->second.location_subscription_callbacks; it->second.location_subscription_callbacks.clear(); it->second.location_version++; - for (const auto callback : callbacks) { - callback(it->second.locations, it->second.object_size, it->second.spilled_url, - it->second.spilled_node_id, it->second.location_version); + for (const auto &callback : callbacks) { + callback(it->second.locations, it->second.object_size, it->second.location_version); } } @@ -1048,8 +1031,7 @@ Status ReferenceCounter::SubscribeObjectLocations( // If the last location version is less than the current location version, we // already have location data that the subscriber hasn't seen yet, so we immediately // invoke the callback. - callback(it->second.locations, it->second.object_size, it->second.spilled_url, - it->second.spilled_node_id, it->second.location_version); + callback(it->second.locations, it->second.object_size, it->second.location_version); } else { // Otherwise, save the callback for later invocation. it->second.location_subscription_callbacks.push_back(callback); diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index 415044d702dd..014b94714715 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -51,8 +51,7 @@ class ReferenceCounterInterface { // Callback for location subscriptions. using LocationSubscriptionCallback = - std::function &, int64_t, const std::string &, - const NodeID &, int64_t)>; + std::function &, int64_t, int64_t)>; /// Class used by the core worker to keep track of ObjectID reference counts for garbage /// collection. This class is thread safe. @@ -424,15 +423,8 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Handle an object has been spilled to external storage. 
/// /// This notifies the primary raylet that the object is safe to release and - /// records the spill URL, spill node ID, and updated object size. - /// \param[in] object_id The object that has been spilled. - /// \param[in] spilled_url The URL to which the object has been spilled. - /// \param[in] spilled_node_id The ID of the node on which the object was spilled. - /// \param[in] size The size of the object. - /// \param[in] release Whether to release the reference. - /// \return True if the reference exists, false otherwise. - bool HandleObjectSpilled(const ObjectID &object_id, const std::string spilled_url, - const NodeID &spilled_node_id, int64_t size, bool release); + /// records that the object has been spilled to suppress reconstruction. + void HandleObjectSpilled(const ObjectID &object_id); /// Get locality data for object. absl::optional GetLocalityData(const ObjectID &object_id); @@ -594,13 +586,6 @@ class ReferenceCounter : public ReferenceCounterInterface, size_t lineage_ref_count = 0; /// Whether this object has been spilled to external storage. bool spilled = false; - /// For objects that have been spilled to external storage, the URL from which - /// they can be retrieved. - std::string spilled_url = ""; - /// The ID of the node that spilled the object. - /// This will be Nil if the object has not been spilled or if it is spilled - /// distributed external storage. - NodeID spilled_node_id = NodeID::Nil(); /// Location subscription callbacks registered by async location get requests. /// These will be invoked whenever locations or object_size are changed. 
std::vector location_subscription_callbacks; diff --git a/src/ray/object_manager/ownership_based_object_directory.cc b/src/ray/object_manager/ownership_based_object_directory.cc index e5477c0c20f7..3f2ccc540ed2 100644 --- a/src/ray/object_manager/ownership_based_object_directory.cc +++ b/src/ray/object_manager/ownership_based_object_directory.cc @@ -34,56 +34,6 @@ void FilterRemovedNodes(std::shared_ptr gcs_client, } } -/// Update object location data based on response from the owning core worker. -bool UpdateObjectLocations(const rpc::GetObjectLocationsOwnerReply &location_reply, - const Status &status, const ObjectID &object_id, - std::shared_ptr gcs_client, - std::unordered_set *node_ids, std::string *spilled_url, - NodeID *spilled_node_id, size_t *object_size) { - bool is_updated = false; - - std::unordered_set new_node_ids; - - if (!status.ok()) { - RAY_LOG(INFO) << "Failed to return location updates to subscribers for " << object_id - << ": " << status.ToString() - << ", assuming that the object was freed or evicted."; - // When we can't get location updates from the owner, we assume that the object was - // freed or evicted, so we send an empty location update to all subscribers. - *node_ids = new_node_ids; - is_updated = true; - } else { - // The size can be 0 if the update was a deletion. This assumes that an - // object's size is always greater than 0. - // TODO(swang): If that's not the case, we should use a flag to check - // whether the size is set instead. - if (location_reply.object_size() > 0) { - *object_size = location_reply.object_size(); - is_updated = true; - } - for (auto const &node_id : location_reply.node_ids()) { - new_node_ids.emplace(NodeID::FromBinary(node_id)); - } - // Filter out the removed nodes from the object locations. 
- FilterRemovedNodes(gcs_client, &new_node_ids); - if (new_node_ids != *node_ids) { - *node_ids = new_node_ids; - is_updated = true; - } - const std::string &new_spilled_url = location_reply.spilled_url(); - if (new_spilled_url != *spilled_url) { - const auto new_spilled_node_id = - NodeID::FromBinary(location_reply.spilled_node_id()); - RAY_LOG(DEBUG) << "Received object spilled to " << new_spilled_url << " spilled on " - << new_spilled_node_id; - *spilled_url = new_spilled_url; - *spilled_node_id = new_spilled_node_id; - is_updated = true; - } - } - return is_updated; -} - rpc::Address GetOwnerAddressFromObjectInfo( const object_manager::protocol::ObjectInfoT &object_info) { rpc::Address owner_address; @@ -191,13 +141,28 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( if (it == listeners_.end()) { return; } + std::unordered_set node_ids; + // Once this flag is set to true, it should never go back to false. it->second.subscribed = true; - // Update entries for this object. - if (UpdateObjectLocations(reply, status, object_id, gcs_client_, - &it->second.current_object_locations, &it->second.spilled_url, - &it->second.spilled_node_id, &it->second.object_size)) { + if (!status.ok()) { + RAY_LOG(INFO) << "Worker " << worker_id << " failed to return location updates to " + << "subscribers for " << object_id << ": " << status.ToString() + << ", assuming that the object was freed or evicted."; + it->second.object_size = 0; + } else { + if (reply.object_size() > 0) { + it->second.object_size = reply.object_size(); + } + + for (auto const &node_id : reply.node_ids()) { + node_ids.emplace(NodeID::FromBinary(node_id)); + } + FilterRemovedNodes(gcs_client_, &node_ids); + } + if (node_ids != it->second.current_object_locations || !status.ok()) { + it->second.current_object_locations = std::move(node_ids); // Copy the callbacks so that the callbacks can unsubscribe without interrupting // looping over the callbacks. 
auto callbacks = it->second.callbacks; @@ -206,12 +171,10 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( // empty, since this may indicate that the objects have been evicted from // all nodes. for (const auto &callback_pair : callbacks) { - // We can call the callback directly without worrying about invalidating caller - // iterators since this is already running in the subscription callback stack. - // See https://github.com/ray-project/ray/issues/2959. - callback_pair.second(object_id, it->second.current_object_locations, - it->second.spilled_url, it->second.spilled_node_id, - it->second.object_size); + // It is safe to call the callback directly since this is already running + // in the subscription callback stack. + callback_pair.second(object_id, it->second.current_object_locations, "", + NodeID::Nil(), it->second.object_size); } } @@ -259,16 +222,10 @@ ray::Status OwnershipBasedObjectDirectory::SubscribeObjectLocations( // immediately notify the caller of the current known locations. if (listener_state.subscribed) { auto &locations = listener_state.current_object_locations; - auto &spilled_url = listener_state.spilled_url; - auto &spilled_node_id = listener_state.spilled_node_id; - auto object_size = listener_state.object_size; - // We post the callback to the event loop in order to avoid mutating data structures - // shared with the caller and potentially invalidating caller iterators. - // See https://github.com/ray-project/ray/issues/2959. 
- io_service_.post( - [callback, locations, spilled_url, spilled_node_id, object_size, object_id]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); + auto object_size = it->second.object_size; + io_service_.post([callback, locations, object_size, object_id]() { + callback(object_id, locations, "", NodeID::Nil(), object_size); + }); } return Status::OK(); } @@ -289,63 +246,36 @@ ray::Status OwnershipBasedObjectDirectory::UnsubscribeObjectLocations( ray::Status OwnershipBasedObjectDirectory::LookupLocations( const ObjectID &object_id, const rpc::Address &owner_address, const OnLocationsFound &callback) { - auto it = listeners_.find(object_id); - if (it != listeners_.end() && it->second.subscribed) { - // If we have locations cached due to a concurrent SubscribeObjectLocations - // call, and we have received at least one update from the owner about - // the object's creation, then call the callback immediately with the - // cached locations. - auto &locations = it->second.current_object_locations; - auto &spilled_url = it->second.spilled_url; - auto &spilled_node_id = it->second.spilled_node_id; - auto object_size = it->second.object_size; - // We post the callback to the event loop in order to avoid mutating data structures - // shared with the caller and potentially invalidating caller iterators. - // See https://github.com/ray-project/ray/issues/2959. - io_service_.post( - [callback, object_id, locations, spilled_url, spilled_node_id, object_size]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); - } else { - WorkerID worker_id = WorkerID::FromBinary(owner_address.worker_id()); - std::shared_ptr rpc_client = GetClient(owner_address); - if (rpc_client == nullptr) { - RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. 
" - << "LookupLocations returns an empty list of locations."; - // We post the callback to the event loop in order to avoid mutating data structures - // shared with the caller and potentially invalidating caller iterators. - // See https://github.com/ray-project/ray/issues/2959. - io_service_.post([callback, object_id]() { - callback(object_id, std::unordered_set(), "", NodeID::Nil(), 0); - }); - return Status::OK(); - } + WorkerID worker_id = WorkerID::FromBinary(owner_address.worker_id()); + std::shared_ptr rpc_client = GetClient(owner_address); + if (rpc_client == nullptr) { + RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. " + << "LookupLocations returns an empty list of locations."; + io_service_.post([callback, object_id]() { + callback(object_id, std::unordered_set(), "", NodeID::Nil(), 0); + }); + return Status::OK(); + } - rpc::GetObjectLocationsOwnerRequest request; - request.set_intended_worker_id(owner_address.worker_id()); - request.set_object_id(object_id.Binary()); - request.set_last_version(-1); + rpc::GetObjectLocationsOwnerRequest request; + request.set_intended_worker_id(owner_address.worker_id()); + request.set_object_id(object_id.Binary()); + request.set_last_version(-1); - rpc_client->GetObjectLocationsOwner( - request, [this, worker_id, object_id, callback]( - Status status, const rpc::GetObjectLocationsOwnerReply &reply) { - if (!status.ok()) { - RAY_LOG(ERROR) << "Worker " << worker_id << " failed to get the location for " - << object_id; - } - std::unordered_set node_ids; - std::string spilled_url; - NodeID spilled_node_id; - size_t object_size = 0; - UpdateObjectLocations(reply, status, object_id, gcs_client_, &node_ids, - &spilled_url, &spilled_node_id, &object_size); - // We can call the callback directly without worrying about invalidating - // caller iterators since this is already running in the core worker - // client's lookup callback stack. - // See https://github.com/ray-project/ray/issues/2959. 
- callback(object_id, node_ids, spilled_url, spilled_node_id, object_size); - }); - } + rpc_client->GetObjectLocationsOwner( + request, [this, worker_id, object_id, callback]( + Status status, const rpc::GetObjectLocationsOwnerReply &reply) { + if (!status.ok()) { + RAY_LOG(ERROR) << "Worker " << worker_id << " failed to get the location for " + << object_id; + } + std::unordered_set node_ids; + for (auto const &node_id : reply.node_ids()) { + node_ids.emplace(NodeID::FromBinary(node_id)); + } + FilterRemovedNodes(gcs_client_, &node_ids); + callback(object_id, node_ids, "", NodeID::Nil(), reply.object_size()); + }); return Status::OK(); } diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 66d5eb570782..ef5f9730212f 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -189,18 +189,10 @@ message GetObjectLocationsOwnerRequest { } message GetObjectLocationsOwnerReply { - // The IDs of the nodes that this object appeared on or was evicted by. repeated bytes node_ids = 1; - // The size of the object in bytes. uint64 object_size = 2; - // The object has been spilled to this URL. This should be set xor the above - // fields are set. - string spilled_url = 3; - // The ID of the node that spilled the object. - // This will be Nil if the object was spilled to distributed external storage. - bytes spilled_node_id = 4; // The version of the returned location updates. - int64 current_version = 5; + int64 current_version = 3; } message KillActorRequest { @@ -314,9 +306,6 @@ message PlasmaObjectReadyReply { message SpillObjectsRequest { // The IDs of objects to be spilled. repeated bytes object_ids_to_spill = 1; - // The owner addresses of the objects to be spilled. Must be in the same order as - // object_ids_to_spill. 
- repeated Address owner_addresses = 2; } message SpillObjectsReply { @@ -344,22 +333,6 @@ message DeleteSpilledObjectsRequest { message DeleteSpilledObjectsReply { } -message AddSpilledUrlRequest { - // Object that was spilled. - bytes object_id = 1; - // For objects that have been spilled to external storage, the URL from which - // they can be retrieved. - string spilled_url = 2; - // The ID of the node that spilled the object. - // This will be Nil if the object was spilled to distributed external storage. - bytes spilled_node_id = 3; - // The size of the object in bytes. - int64 size = 4; -} - -message AddSpilledUrlReply { -} - message ExitRequest { } @@ -412,9 +385,6 @@ service CoreWorkerService { // Delete spilled objects from external storage. Caller: raylet; callee: I/O worker. rpc DeleteSpilledObjects(DeleteSpilledObjectsRequest) returns (DeleteSpilledObjectsReply); - // Add spilled URL, spilled node ID, and update object size for owned object. - // Caller: raylet; callee: owner worker. - rpc AddSpilledUrl(AddSpilledUrlRequest) returns (AddSpilledUrlReply); // Notification from raylet that an object ID is available in local plasma. rpc PlasmaObjectReady(PlasmaObjectReadyRequest) returns (PlasmaObjectReadyReply); // Request for a worker to exit. diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index 9273665f3ed2..8e225293c54f 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -179,10 +179,6 @@ message RequestObjectSpillageRequest { message RequestObjectSpillageReply { // Whether the object spilling was successful or not. bool success = 1; - // Object URL where the object is spilled. - string object_url = 2; - // The node id of a node where the object is spilled. 
- bytes spilled_node_id = 3; } message RestoreSpilledObjectRequest { diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index 3ee7de57c816..ef9e53e21baf 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -21,8 +21,7 @@ namespace ray { namespace raylet { void LocalObjectManager::PinObjects(const std::vector &object_ids, - std::vector> &&objects, - const rpc::Address &owner_address) { + std::vector> &&objects) { RAY_CHECK(object_pinning_enabled_); for (size_t i = 0; i < object_ids.size(); i++) { const auto &object_id = object_ids[i]; @@ -34,7 +33,7 @@ void LocalObjectManager::PinObjects(const std::vector &object_ids, } RAY_LOG(DEBUG) << "Pinning object " << object_id; pinned_objects_size_ += object->GetSize(); - pinned_objects_.emplace(object_id, std::make_pair(std::move(object), owner_address)); + pinned_objects_.emplace(object_id, std::move(object)); } } @@ -72,7 +71,7 @@ void LocalObjectManager::ReleaseFreedObject(const ObjectID &object_id) { spilled_object_pending_delete_.push(object_id); } if (pinned_objects_.count(object_id)) { - pinned_objects_size_ -= pinned_objects_[object_id].first->GetSize(); + pinned_objects_size_ -= pinned_objects_[object_id]->GetSize(); pinned_objects_.erase(object_id); } } @@ -144,7 +143,7 @@ bool LocalObjectManager::SpillObjectsOfSize(int64_t num_bytes_to_spill) { std::vector objects_to_spill; while (bytes_to_spill <= num_bytes_to_spill && it != pinned_objects_.end()) { if (is_plasma_object_spillable_(it->first)) { - bytes_to_spill += it->second.first->GetSize(); + bytes_to_spill += it->second->GetSize(); objects_to_spill.push_back(it->first); } it++; @@ -156,7 +155,7 @@ bool LocalObjectManager::SpillObjectsOfSize(int64_t num_bytes_to_spill) { SpillObjectsInternal(objects_to_spill, [this, bytes_to_spill, objects_to_spill, start_time](const Status &status) { if (!status.ok()) { - RAY_LOG(INFO) << "Failed to spill objects: " << status.ToString(); 
+ RAY_LOG(ERROR) << "Error spilling objects " << status.ToString(); } else { auto now = absl::GetCurrentTimeNanos(); RAY_LOG(DEBUG) << "Spilled " << bytes_to_spill << " bytes in " @@ -211,7 +210,7 @@ void LocalObjectManager::SpillObjectsInternal( if (it != pinned_objects_.end()) { RAY_LOG(DEBUG) << "Spilling object " << id; objects_to_spill.push_back(id); - num_bytes_pending_spill_ += it->second.first->GetSize(); + num_bytes_pending_spill_ += it->second->GetSize(); objects_pending_spill_[id] = std::move(it->second); pinned_objects_.erase(it); } @@ -229,9 +228,6 @@ void LocalObjectManager::SpillObjectsInternal( for (const auto &object_id : objects_to_spill) { RAY_LOG(DEBUG) << "Sending spill request for object " << object_id; request.add_object_ids_to_spill(object_id.Binary()); - auto it = objects_pending_spill_.find(object_id); - RAY_CHECK(it != objects_pending_spill_.end()); - request.add_owner_addresses()->MergeFrom(it->second.second); } io_worker->rpc_client()->SpillObjects( request, [this, objects_to_spill, callback, io_worker]( @@ -245,7 +241,7 @@ void LocalObjectManager::SpillObjectsInternal( for (const auto &object_id : objects_to_spill) { auto it = objects_pending_spill_.find(object_id); RAY_CHECK(it != objects_pending_spill_.end()); - pinned_objects_size_ += it->second.first->GetSize(); + pinned_objects_size_ += it->second->GetSize(); pinned_objects_.emplace(object_id, std::move(it->second)); objects_pending_spill_.erase(it); } @@ -262,46 +258,6 @@ void LocalObjectManager::SpillObjectsInternal( }); } -void LocalObjectManager::UnpinSpilledObjectCallback( - const ObjectID &object_id, const std::string &object_url, - std::shared_ptr num_remaining, - std::function callback, ray::Status status) { - if (!status.ok()) { - RAY_LOG(INFO) << "Failed to send spilled url for object " << object_id - << " to object directory, considering the object to have been freed: " - << status.ToString(); - } else { - RAY_LOG(DEBUG) << "Object " << object_id << " spilled to " << 
object_url - << " and object directory has been informed"; - } - RAY_LOG(DEBUG) << "Unpinning pending spill object " << object_id; - // Unpin the object. - auto it = objects_pending_spill_.find(object_id); - RAY_CHECK(it != objects_pending_spill_.end()); - num_bytes_pending_spill_ -= it->second.first->GetSize(); - objects_pending_spill_.erase(it); - - // Update the object_id -> url_ref_count to use it for deletion later. - // We need to track the references here because a single file can contain - // multiple objects, and we shouldn't delete the file until - // all the objects are gone out of scope. - // object_url is equivalent to url_with_offset. - auto parsed_url = ParseURL(object_url); - const auto base_url_it = parsed_url->find("url"); - RAY_CHECK(base_url_it != parsed_url->end()); - if (!url_ref_count_.contains(base_url_it->second)) { - url_ref_count_[base_url_it->second] = 1; - } else { - url_ref_count_[base_url_it->second] += 1; - } - spilled_objects_url_.emplace(object_id, object_url); - - (*num_remaining)--; - if (*num_remaining == 0 && callback) { - callback(status); - } -} - void LocalObjectManager::AddSpilledUrls( const std::vector &object_ids, const rpc::SpillObjectsReply &worker_reply, std::function callback) { @@ -318,36 +274,39 @@ void LocalObjectManager::AddSpilledUrls( auto it = objects_pending_spill_.find(object_id); RAY_CHECK(it != objects_pending_spill_.end()); - auto unpin_callback = - std::bind(&LocalObjectManager::UnpinSpilledObjectCallback, this, object_id, - object_url, num_remaining, callback, std::placeholders::_1); - - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - // TODO(Clark): Don't send RPC to owner if we're fulfilling an owner-initiated - // spill RPC. 
- rpc::AddSpilledUrlRequest request; - request.set_object_id(object_id.Binary()); - request.set_spilled_url(object_url); - request.set_spilled_node_id(node_id_object_spilled.Binary()); - request.set_size(it->second.first->GetSize()); - - auto owner_client = owner_client_pool_.GetOrConnect(it->second.second); - RAY_LOG(DEBUG) << "Sending spilled URL " << object_url << " for object " - << object_id << " to owner " - << WorkerID::FromBinary(it->second.second.worker_id()); - // Send spilled URL, spilled node ID, and object size to owner. - owner_client->AddSpilledUrl( - request, [unpin_callback](Status status, const rpc::AddSpilledUrlReply &reply) { - unpin_callback(status); - }); - } else { - // Write to object directory. Wait for the write to finish before - // releasing the object to make sure that the spilled object can - // be retrieved by other raylets. - RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl( - object_id, object_url, node_id_object_spilled, it->second.first->GetSize(), - unpin_callback)); - } + // Write to object directory. Wait for the write to finish before + // releasing the object to make sure that the spilled object can + // be retrieved by other raylets. + RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl( + object_id, object_url, node_id_object_spilled, it->second->GetSize(), + [this, object_id, object_url, callback, num_remaining](Status status) { + RAY_CHECK_OK(status); + // Unpin the object. + auto it = objects_pending_spill_.find(object_id); + RAY_CHECK(it != objects_pending_spill_.end()); + num_bytes_pending_spill_ -= it->second->GetSize(); + objects_pending_spill_.erase(it); + + // Update the object_id -> url_ref_count to use it for deletion later. + // We need to track the references here because a single file can contain + // multiple objects, and we shouldn't delete the file until + // all the objects are gone out of scope. + // object_url is equivalent to url_with_offset. 
+ auto parsed_url = ParseURL(object_url); + const auto base_url_it = parsed_url->find("url"); + RAY_CHECK(base_url_it != parsed_url->end()); + if (!url_ref_count_.contains(base_url_it->second)) { + url_ref_count_[base_url_it->second] = 1; + } else { + url_ref_count_[base_url_it->second] += 1; + } + spilled_objects_url_.emplace(object_id, object_url); + + (*num_remaining)--; + if (*num_remaining == 0 && callback) { + callback(status); + } + })); } } @@ -362,11 +321,11 @@ void LocalObjectManager::AsyncRestoreSpilledObject( if (!node_id.IsNil() && node_id != self_node_id_) { // If we know where this object was spilled, and the current node is not that one, // send a RPC to a remote node that spilled the object to restore it. - RAY_LOG(DEBUG) << "Send an object restoration request of id: " << object_id + RAY_LOG(DEBUG) << "Send a object restoration request of id: " << object_id << " to a remote node: " << node_id; // TODO(sang): We need to deduplicate this remote RPC. Since restore request - // is retried every 10ms without exponential backoff, this can add huge overhead to - // a remote node that spilled the object. + // is retried every 10ms without exponential backoff, this can add huge overhead to a + // remote node that spilled the object. restore_object_from_remote_node_(object_id, object_url, node_id); if (callback) { callback(Status::OK()); @@ -436,9 +395,9 @@ void LocalObjectManager::ProcessSpilledObjectsDeleteQueue(uint32_t max_batch_siz object_urls_to_delete.size() < max_batch_size) { auto &object_id = spilled_object_pending_delete_.front(); // If the object is still spilling, do nothing. This will block other entries to be - // processed, but it should be fine because the spilling will be eventually done, - // and deleting objects is the low priority tasks. This will instead enable simpler - // logic after this block. + // processed, but it should be fine because the spilling will be eventually done, and + // deleting objects is the low priority tasks. 
+ // This will instead enable simpler logic after this block. if (objects_pending_spill_.contains(object_id)) { break; } @@ -446,8 +405,8 @@ void LocalObjectManager::ProcessSpilledObjectsDeleteQueue(uint32_t max_batch_siz // Object id is either spilled or not spilled at this point. const auto spilled_objects_url_it = spilled_objects_url_.find(object_id); if (spilled_objects_url_it != spilled_objects_url_.end()) { - // If the object was spilled, see if we can delete it. We should first check the - // ref count. + // If the object was spilled, see if we can delete it. We should first check the ref + // count. std::string &object_url = spilled_objects_url_it->second; // Note that here, we need to parse the object url to obtain the base_url. auto parsed_url = ParseURL(object_url); @@ -516,4 +475,5 @@ std::string LocalObjectManager::DebugString() const { } }; // namespace raylet + }; // namespace ray diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index 267edabd9d8a..57ef8d3a1673 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -70,10 +70,8 @@ class LocalObjectManager { /// \param object_ids The objects to be pinned. /// \param objects Pointers to the objects to be pinned. The pointer should /// be kept in scope until the object can be released. - /// \param owner_address The owner of the objects to be pinned. void PinObjects(const std::vector &object_ids, - std::vector> &&objects, - const rpc::Address &owner_address); + std::vector> &&objects); /// Wait for the objects' owner to free the object. The objects will be /// released when the owner at the given address fails or replies that the @@ -166,14 +164,6 @@ class LocalObjectManager { /// objects. void FlushFreeObjects(); - // A callback for unpinning spilled objects. This should be invoked after the object - // has been spilled and after the object directory has been sent the spilled URL. 
- void UnpinSpilledObjectCallback(const ObjectID &object_id, - const std::string &object_url, - std::shared_ptr num_remaining, - std::function callback, - ray::Status status); - /// Add objects' spilled URLs to the global object directory. Call the /// callback once all URLs have been added. void AddSpilledUrls(const std::vector &object_ids, @@ -213,8 +203,7 @@ class LocalObjectManager { std::function &)> on_objects_freed_; // Objects that are pinned on this node. - absl::flat_hash_map, rpc::Address>> - pinned_objects_; + absl::flat_hash_map> pinned_objects_; // Total size of objects pinned on this node. size_t pinned_objects_size_ = 0; @@ -222,8 +211,7 @@ class LocalObjectManager { // Objects that were pinned on this node but that are being spilled. // These objects will be released once spilling is complete and the URL is // written to the object directory. - absl::flat_hash_map, rpc::Address>> - objects_pending_spill_; + absl::flat_hash_map> objects_pending_spill_; /// Objects that were spilled on this node but that are being restored. 
/// The field is used to dedup the same restore request while restoration is in diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 2287fd3e821b..9b66d0a7cc82 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -516,17 +516,11 @@ void NodeManager::DoLocalGC() { void NodeManager::HandleRequestObjectSpillage( const rpc::RequestObjectSpillageRequest &request, rpc::RequestObjectSpillageReply *reply, rpc::SendReplyCallback send_reply_callback) { - const auto &object_id = ObjectID::FromBinary(request.object_id()); - RAY_LOG(DEBUG) << "Received RequestObjectSpillage for object " << object_id; local_object_manager_.SpillObjects( - {object_id}, [object_id, reply, send_reply_callback](const ray::Status &status) { + {ObjectID::FromBinary(request.object_id())}, + [reply, send_reply_callback](const ray::Status &status) { if (status.ok()) { - RAY_LOG(DEBUG) << "Object " << object_id - << " has been spilled, replying to owner"; reply->set_success(true); - // TODO(Clark): Add spilled URLs and spilled node ID to owner RPC reply here - // if OBOD is enabled, instead of relying on automatic raylet spilling path to - // send an extra RPC to the owner. 
} send_reply_callback(Status::OK(), nullptr, nullptr); }); @@ -2412,7 +2406,6 @@ void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, rpc::SendReplyCallback send_reply_callback) { std::vector object_ids; object_ids.reserve(request.object_ids_size()); - const auto &owner_address = request.owner_address(); for (const auto &object_id_binary : request.object_ids()) { object_ids.push_back(ObjectID::FromBinary(object_id_binary)); } @@ -2426,10 +2419,10 @@ void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr); return; } - local_object_manager_.PinObjects(object_ids, std::move(results), owner_address); + local_object_manager_.PinObjects(object_ids, std::move(results)); } // Wait for the object to be freed by the owner, which keeps the ref count. - local_object_manager_.WaitForObjectFree(owner_address, object_ids); + local_object_manager_.WaitForObjectFree(request.owner_address(), object_ids); send_reply_callback(Status::OK(), nullptr, nullptr); } diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index d056928c0219..f68707ce7a01 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -37,41 +37,21 @@ class MockWorkerClient : public rpc::CoreWorkerClientInterface { void WaitForObjectEviction( const rpc::WaitForObjectEvictionRequest &request, const rpc::ClientCallback &callback) override { - eviction_callbacks.push_back(callback); + callbacks.push_back(callback); } bool ReplyObjectEviction(Status status = Status::OK()) { - if (eviction_callbacks.empty()) { + if (callbacks.size() == 0) { return false; } - auto callback = eviction_callbacks.front(); + auto callback = callbacks.front(); auto reply = rpc::WaitForObjectEvictionReply(); callback(status, reply); - eviction_callbacks.pop_front(); - return true; - } - - void 
AddSpilledUrl( - const rpc::AddSpilledUrlRequest &request, - const rpc::ClientCallback &callback) override { - object_urls.emplace(ObjectID::FromBinary(request.object_id()), request.spilled_url()); - spilled_url_callbacks.push_back(callback); - } - - bool ReplyAddSpilledUrl(Status status = Status::OK()) { - if (spilled_url_callbacks.empty()) { - return false; - } - auto callback = spilled_url_callbacks.front(); - auto reply = rpc::AddSpilledUrlReply(); - callback(status, reply); - spilled_url_callbacks.pop_front(); + callbacks.pop_front(); return true; } - std::deque> eviction_callbacks; - std::unordered_map object_urls; - std::deque> spilled_url_callbacks; + std::list> callbacks; }; class MockIOWorkerClient : public rpc::CoreWorkerClientInterface { @@ -354,7 +334,7 @@ TEST_F(LocalObjectManagerTest, TestPin) { new RayObject(nullptr, meta_buffer, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); for (size_t i = 0; i < free_objects_batch_size; i++) { @@ -369,8 +349,6 @@ TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { // First, spill objects. 
std::vector object_ids; std::vector> objects; - rpc::Address owner_address; - owner_address.set_worker_id(WorkerID::FromRandom().Binary()); for (size_t i = 0; i < free_objects_batch_size; i++) { ObjectID object_id = ObjectID::FromRandom(); @@ -380,7 +358,7 @@ TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.SpillObjects(object_ids, [&](const Status &status) mutable { ASSERT_TRUE(status.ok()); }); @@ -390,11 +368,7 @@ TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Then try restoring objects from local. 
@@ -442,8 +416,6 @@ TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { TEST_F(LocalObjectManagerTest, TestExplicitSpill) { std::vector object_ids; std::vector> objects; - rpc::Address owner_address; - owner_address.set_worker_id(WorkerID::FromRandom().Binary()); for (size_t i = 0; i < free_objects_batch_size; i++) { ObjectID object_id = ObjectID::FromRandom(); @@ -453,7 +425,7 @@ TEST_F(LocalObjectManagerTest, TestExplicitSpill) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); int num_times_fired = 0; manager.SpillObjects(object_ids, [&](const Status &status) mutable { @@ -472,19 +444,11 @@ TEST_F(LocalObjectManagerTest, TestExplicitSpill) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } ASSERT_EQ(num_times_fired, 1); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls[object_ids[i]], urls[i]); - } else { - ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); - } + ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); } for (const auto &id : object_ids) { ASSERT_EQ((*unpins)[id], 1); @@ -506,7 +470,7 @@ TEST_F(LocalObjectManagerTest, TestDuplicateSpill) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); int num_times_fired = 0; @@ 
-530,19 +494,11 @@ TEST_F(LocalObjectManagerTest, TestDuplicateSpill) { EXPECT_CALL(worker_pool, PushSpillWorker(_)); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } ASSERT_EQ(num_times_fired, 1); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls[object_ids[i]], urls[i]); - } else { - ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); - } + ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); } ASSERT_FALSE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (const auto &id : object_ids) { @@ -568,7 +524,7 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); ASSERT_TRUE(manager.SpillObjectsOfSize(total_size / 2)); for (const auto &id : object_ids) { ASSERT_EQ((*unpins)[id], 0); @@ -585,26 +541,13 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) { // to evict. 
ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < urls.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls.size(), object_ids.size() / 2 + 1); - for (auto &object_url : owner_client->object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } else { - ASSERT_EQ(object_table.object_urls.size(), object_ids.size() / 2 + 1); - for (auto &object_url : object_table.object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } + ASSERT_EQ(object_table.object_urls.size(), object_ids.size() / 2 + 1); + for (auto &object_url : object_table.object_urls) { + auto it = std::find(urls.begin(), urls.end(), object_url.second); + ASSERT_TRUE(it != urls.end()); + ASSERT_EQ((*unpins)[object_url.first], 1); } // Make sure providing 0 bytes to SpillObjectsOfSize will spill one object. 
@@ -613,23 +556,13 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) { EXPECT_CALL(worker_pool, PushSpillWorker(_)); const std::string url = BuildURL("url" + std::to_string(object_ids.size())); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({url})); + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); + ASSERT_EQ(object_table.object_urls.size(), 3); urls.push_back(url); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - ASSERT_EQ(owner_client->object_urls.size(), 3); - for (auto &object_url : owner_client->object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - ASSERT_EQ(object_table.object_urls.size(), 3); - for (auto &object_url : object_table.object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } + for (auto &object_url : object_table.object_urls) { + auto it = std::find(urls.begin(), urls.end(), object_url.second); + ASSERT_TRUE(it != urls.end()); + ASSERT_EQ((*unpins)[object_url.first], 1); } // Since there's no more object to spill, this should fail. 
@@ -654,7 +587,7 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectNotEvictable) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); ASSERT_FALSE(manager.SpillObjectsOfSize(1000)); for (const auto &id : object_ids) { ASSERT_EQ((*unpins)[id], 0); @@ -683,7 +616,7 @@ TEST_F(LocalObjectManagerTest, TestSpillUptoMaxThroughput) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); // This will spill until 2 workers are occupied. manager.SpillObjectUptoMaxThroughput(); @@ -700,23 +633,12 @@ TEST_F(LocalObjectManagerTest, TestSpillUptoMaxThroughput) { std::vector urls; urls.push_back(BuildURL("url" + std::to_string(0))); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({urls[0]})); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - // Make sure object is spilled. - ASSERT_EQ(owner_client->object_urls.size(), 1); - for (auto &object_url : owner_client->object_urls) { - if (urls[0] == object_url.second) { - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - // Make sure object is spilled. - ASSERT_EQ(object_table.object_urls.size(), 1); - for (auto &object_url : object_table.object_urls) { - if (urls[0] == object_url.second) { - ASSERT_EQ((*unpins)[object_url.first], 1); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); + // Make sure object is spilled. 
+ ASSERT_EQ(object_table.object_urls.size(), 1); + for (auto &object_url : object_table.object_urls) { + if (urls[0] == object_url.second) { + ASSERT_EQ((*unpins)[object_url.first], 1); } } @@ -734,26 +656,13 @@ TEST_F(LocalObjectManagerTest, TestSpillUptoMaxThroughput) { } for (size_t i = 1; i < urls.size(); i++) { ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({urls[i]})); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls.size(), 3); - for (auto &object_url : owner_client->object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } else { - ASSERT_EQ(object_table.object_urls.size(), 3); - for (auto &object_url : object_table.object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } + ASSERT_EQ(object_table.object_urls.size(), 3); + for (auto &object_url : object_table.object_urls) { + auto it = std::find(urls.begin(), urls.end(), object_url.second); + ASSERT_TRUE(it != urls.end()); + ASSERT_EQ((*unpins)[object_url.first], 1); } // We cannot spill anymore as there is no more pinned object. 
@@ -774,7 +683,7 @@ TEST_F(LocalObjectManagerTest, TestSpillError) { std::vector> objects; objects.push_back(std::move(object)); - manager.PinObjects({object_id}, std::move(objects), owner_address); + manager.PinObjects({object_id}, std::move(objects)); int num_times_fired = 0; manager.SpillObjects({object_id}, [&](const Status &status) mutable { @@ -786,11 +695,7 @@ TEST_F(LocalObjectManagerTest, TestSpillError) { EXPECT_CALL(worker_pool, PushSpillWorker(_)); ASSERT_TRUE( worker_pool.io_worker_client->ReplySpillObjects({}, Status::IOError("error"))); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_FALSE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_FALSE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_FALSE(object_table.ReplyAsyncAddSpilledUrl()); ASSERT_EQ(num_times_fired, 1); ASSERT_EQ((*unpins)[object_id], 0); @@ -802,14 +707,9 @@ TEST_F(LocalObjectManagerTest, TestSpillError) { std::string url = BuildURL("url"); EXPECT_CALL(worker_pool, PushSpillWorker(_)); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({url})); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - ASSERT_EQ(owner_client->object_urls[object_id], url); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - ASSERT_EQ(object_table.object_urls[object_id], url); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); ASSERT_EQ(num_times_fired, 2); + ASSERT_EQ(object_table.object_urls[object_id], url); ASSERT_EQ((*unpins)[object_id], 1); } @@ -829,7 +729,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteNoSpilledObjects) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); for (size_t i = 0; i < free_objects_batch_size; i++) { @@ -857,7 
+757,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpilledObjects) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); // 2 Objects are spilled out of 3. @@ -874,11 +774,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpilledObjects) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // All objects are out of scope now. @@ -909,7 +805,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCount) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); // Every object is spilled. @@ -930,11 +826,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCount) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Everything is evicted except the last object. In this case, ref count is still > 0. 
@@ -970,7 +862,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpillingObjectsBlocking) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); // Objects are spilled. @@ -989,11 +881,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpillingObjectsBlocking) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < 1; i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Every object has gone out of scope. for (size_t i = 0; i < free_objects_batch_size; i++) { @@ -1012,11 +900,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpillingObjectsBlocking) { new_urls.push_back(BuildURL("url" + std::to_string(i))); } for (size_t i = 1; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Every object is now deleted. 
@@ -1041,7 +925,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteMaxObjects) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); std::vector object_ids_to_spill; @@ -1059,11 +943,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteMaxObjects) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Every reference has gone out of scope. diff --git a/src/ray/rpc/worker/core_worker_client.h b/src/ray/rpc/worker/core_worker_client.h index 8f2796581e31..a014a1776a4e 100644 --- a/src/ray/rpc/worker/core_worker_client.h +++ b/src/ray/rpc/worker/core_worker_client.h @@ -186,9 +186,6 @@ class CoreWorkerClientInterface { const DeleteSpilledObjectsRequest &request, const ClientCallback &callback) {} - virtual void AddSpilledUrl(const AddSpilledUrlRequest &request, - const ClientCallback &callback) {} - virtual void PlasmaObjectReady(const PlasmaObjectReadyRequest &request, const ClientCallback &callback) { } @@ -254,8 +251,6 @@ class CoreWorkerClient : public std::enable_shared_from_this, VOID_RPC_CLIENT_METHOD(CoreWorkerService, DeleteSpilledObjects, grpc_client_, override) - VOID_RPC_CLIENT_METHOD(CoreWorkerService, AddSpilledUrl, grpc_client_, override) - VOID_RPC_CLIENT_METHOD(CoreWorkerService, PlasmaObjectReady, grpc_client_, override) VOID_RPC_CLIENT_METHOD(CoreWorkerService, Exit, grpc_client_, override) diff --git a/src/ray/rpc/worker/core_worker_server.h b/src/ray/rpc/worker/core_worker_server.h index 37c01cf484c2..8f9d236e0b97 100644 
--- a/src/ray/rpc/worker/core_worker_server.h +++ b/src/ray/rpc/worker/core_worker_server.h @@ -44,7 +44,6 @@ namespace rpc { RPC_SERVICE_HANDLER(CoreWorkerService, SpillObjects) \ RPC_SERVICE_HANDLER(CoreWorkerService, RestoreSpilledObjects) \ RPC_SERVICE_HANDLER(CoreWorkerService, DeleteSpilledObjects) \ - RPC_SERVICE_HANDLER(CoreWorkerService, AddSpilledUrl) \ RPC_SERVICE_HANDLER(CoreWorkerService, PlasmaObjectReady) \ RPC_SERVICE_HANDLER(CoreWorkerService, Exit) @@ -66,7 +65,6 @@ namespace rpc { DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(SpillObjects) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(RestoreSpilledObjects) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(DeleteSpilledObjects) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(AddSpilledUrl) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(PlasmaObjectReady) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(Exit) From a5f28c08c2b30da786d167f3e03eedc7009ef199 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 030/244] Revert "[RLlib] Issue #13507: Fix MB-MPO CartPole Env's reward function as well as MB-MPO running into a traj. view API related issue. (#14037)" This reverts commit 5dfdea2c7750fb2eea160546be0839a649f91f0c. 
--- rllib/BUILD | 12 ++-- rllib/agents/mbmpo/model_ensemble.py | 3 - rllib/examples/env/mbmpo_env.py | 86 ++++++++++++++-------------- rllib/policy/dynamic_tf_policy.py | 8 +-- rllib/policy/policy.py | 9 +-- 5 files changed, 53 insertions(+), 65 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index a09a549b1712..431f6b75ab19 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -542,12 +542,12 @@ py_test( ) # MBMPOTrainer -py_test( - name = "test_mbmpo", - tags = ["agents_dir"], - size = "medium", - srcs = ["agents/mbmpo/tests/test_mbmpo.py"] -) +#py_test( +# name = "test_mbmpo", +# tags = ["agents_dir"], +# size = "medium", +# srcs = ["agents/mbmpo/tests/test_mbmpo.py"] +#) # PGTrainer py_test( diff --git a/rllib/agents/mbmpo/model_ensemble.py b/rllib/agents/mbmpo/model_ensemble.py index 1d0f13b719cb..f7cb35b6f9e1 100644 --- a/rllib/agents/mbmpo/model_ensemble.py +++ b/rllib/agents/mbmpo/model_ensemble.py @@ -200,9 +200,6 @@ def loss(self, x, y): def fit(self): # Add env samples to Replay Buffer local_worker = get_global_worker() - for pid, pol in local_worker.policy_map.items(): - pol.view_requirements[ - SampleBatch.NEXT_OBS].used_for_training = True new_samples = local_worker.sample() # Initial Exploration of 8000 timesteps if not self.global_itr: diff --git a/rllib/examples/env/mbmpo_env.py b/rllib/examples/env/mbmpo_env.py index 87c367611d98..c49ef77be78c 100644 --- a/rllib/examples/env/mbmpo_env.py +++ b/rllib/examples/env/mbmpo_env.py @@ -1,12 +1,12 @@ +import gym from gym.envs.classic_control import PendulumEnv, CartPoleEnv import numpy as np # MuJoCo may not be installed. HalfCheetahEnv = HopperEnv = None - try: from gym.envs.mujoco import HalfCheetahEnv, HopperEnv -except Exception: +except (ImportError, gym.error.DependencyNotInstalled): pass @@ -22,12 +22,11 @@ def reward(self, obs, action, obs_next): x = obs_next[:, 0] theta = obs_next[:, 2] - # 1.0 if we are still on, 0.0 if we are terminated due to bounds - # (angular or x-axis) being breached. 
- rew = 1.0 - ((x < -self.x_threshold) | (x > self.x_threshold) | - (theta < -self.theta_threshold_radians) | - (theta > self.theta_threshold_radians)).astype(np.float32) + rew = (x < -self.x_threshold) | (x > self.x_threshold) | ( + theta < -self.theta_threshold_radians) | ( + theta > self.theta_threshold_radians) + rew = rew.astype(float) return rew @@ -55,45 +54,46 @@ def angle_normalize(x): return (((x + np.pi) % (2 * np.pi)) - np.pi) -class HalfCheetahWrapper(HalfCheetahEnv or object): - """Wrapper for the MuJoCo HalfCheetah-v2 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). - """ - - def reward(self, obs, action, obs_next): - if obs.ndim == 2 and action.ndim == 2: - assert obs.shape == obs_next.shape - forward_vel = obs_next[:, 8] - ctrl_cost = 0.1 * np.sum(np.square(action), axis=1) - reward = forward_vel - ctrl_cost - return np.minimum(np.maximum(-1000.0, reward), 1000.0) - else: - forward_vel = obs_next[8] - ctrl_cost = 0.1 * np.square(action).sum() - reward = forward_vel - ctrl_cost +if HalfCheetahEnv: + + class HalfCheetahWrapper(HalfCheetahEnv): + """Wrapper for the MuJoCo HalfCheetah-v2 environment. + + Adds an additional `reward` method for some model-based RL algos (e.g. + MB-MPO). + """ + + def reward(self, obs, action, obs_next): + if obs.ndim == 2 and action.ndim == 2: + assert obs.shape == obs_next.shape + forward_vel = obs_next[:, 8] + ctrl_cost = 0.1 * np.sum(np.square(action), axis=1) + reward = forward_vel - ctrl_cost + return np.minimum(np.maximum(-1000.0, reward), 1000.0) + else: + forward_vel = obs_next[8] + ctrl_cost = 0.1 * np.square(action).sum() + reward = forward_vel - ctrl_cost + return np.minimum(np.maximum(-1000.0, reward), 1000.0) + + class HopperWrapper(HopperEnv): + """Wrapper for the MuJoCo Hopper-v2 environment. + + Adds an additional `reward` method for some model-based RL algos (e.g. + MB-MPO). 
+ """ + + def reward(self, obs, action, obs_next): + alive_bonus = 1.0 + assert obs.ndim == 2 and action.ndim == 2 + assert (obs.shape == obs_next.shape + and action.shape[0] == obs.shape[0]) + vel = obs_next[:, 5] + ctrl_cost = 1e-3 * np.sum(np.square(action), axis=1) + reward = vel + alive_bonus - ctrl_cost return np.minimum(np.maximum(-1000.0, reward), 1000.0) -class HopperWrapper(HopperEnv or object): - """Wrapper for the MuJoCo Hopper-v2 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). - """ - - def reward(self, obs, action, obs_next): - alive_bonus = 1.0 - assert obs.ndim == 2 and action.ndim == 2 - assert (obs.shape == obs_next.shape - and action.shape[0] == obs.shape[0]) - vel = obs_next[:, 5] - ctrl_cost = 1e-3 * np.sum(np.square(action), axis=1) - reward = vel + alive_bonus - ctrl_cost - return np.minimum(np.maximum(-1000.0, reward), 1000.0) - - if __name__ == "__main__": env = PendulumWrapper() env.reset() diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index e56691370eb1..a5b01db875c8 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -580,14 +580,10 @@ def fake_array(tensor): # Add those needed for postprocessing and training. all_accessed_keys = train_batch.accessed_keys | \ batch_for_postproc.accessed_keys - # Tag those only needed for post-processing (with some exceptions). + # Tag those only needed for post-processing. 
for key in batch_for_postproc.accessed_keys: if key not in train_batch.accessed_keys and \ - key not in self.model.view_requirements and \ - key not in [ - SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, - SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS]: + key not in self.model.view_requirements: if key in self.view_requirements: self.view_requirements[key].used_for_training = False if key in self._loss_input_dict: diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 277ec5c24b3c..d208c7d1537d 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -668,16 +668,11 @@ def _initialize_loss_from_dummy_batch( if key not in self.view_requirements: self.view_requirements[key] = ViewRequirement() if self._loss: - # Tag those only needed for post-processing (with some - # exceptions). + # Tag those only needed for post-processing. for key in batch_for_postproc.accessed_keys: if key not in train_batch.accessed_keys and \ key in self.view_requirements and \ - key not in self.model.view_requirements and \ - key not in [ - SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, - SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS]: + key not in self.model.view_requirements: self.view_requirements[key].used_for_training = False # Remove those not needed at all (leave those that are needed # by Sampler to properly execute sample collection). From 2e172c80cd67d7965f65bf1732e545df0df8537f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 031/244] Revert "[RLlib] Issue #13342: Add `validate_spaces` to MB-MPO. (#14038)" This reverts commit f95a020faaa47cb6bfce47cae5e7f6abd53fc793. 
--- rllib/agents/mbmpo/mbmpo_torch_policy.py | 31 ------------------------ rllib/agents/mbmpo/model_ensemble.py | 2 -- rllib/agents/sac/sac_tf_policy.py | 2 +- 3 files changed, 1 insertion(+), 34 deletions(-) diff --git a/rllib/agents/mbmpo/mbmpo_torch_policy.py b/rllib/agents/mbmpo/mbmpo_torch_policy.py index 5dc03435c43b..06e65042e35f 100644 --- a/rllib/agents/mbmpo/mbmpo_torch_policy.py +++ b/rllib/agents/mbmpo/mbmpo_torch_policy.py @@ -1,5 +1,4 @@ import gym -from gym.spaces import Box, Discrete import logging from typing import Tuple, Type @@ -14,7 +13,6 @@ from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.policy.policy import Policy from ray.rllib.policy.policy_template import build_policy_class -from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import apply_grad_clipping from ray.rllib.utils.typing import TrainerConfigDict @@ -24,35 +22,6 @@ logger = logging.getLogger(__name__) -def validate_spaces(policy: Policy, observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: TrainerConfigDict) -> None: - """Validates the observation- and action spaces used for the Policy. - - Args: - policy (Policy): The policy, whose spaces are being validated. - observation_space (gym.spaces.Space): The observation space to - validate. - action_space (gym.spaces.Space): The action space to validate. - config (TrainerConfigDict): The Policy's config dict. - - Raises: - UnsupportedSpaceException: If one of the spaces is not supported. - """ - # Only support single Box or single Discrete spaces. - if not isinstance(action_space, (Box, Discrete)): - raise UnsupportedSpaceException( - "Action space ({}) of {} is not supported for " - "MB-MPO. Must be [Box|Discrete].".format(action_space, policy)) - # If Box, make sure it's a 1D vector space. 
- elif isinstance(action_space, Box) and len(action_space.shape) > 1: - raise UnsupportedSpaceException( - "Action space ({}) of {} has multiple dimensions " - "{}. ".format(action_space, policy, action_space.shape) + - "Consider reshaping this into a single dimension Box space " - "or using the multi-agent API.") - - def make_model_and_action_dist( policy: Policy, obs_space: gym.spaces.Space, diff --git a/rllib/agents/mbmpo/model_ensemble.py b/rllib/agents/mbmpo/model_ensemble.py index f7cb35b6f9e1..2bb9513dabfb 100644 --- a/rllib/agents/mbmpo/model_ensemble.py +++ b/rllib/agents/mbmpo/model_ensemble.py @@ -136,8 +136,6 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, obs_space.low[0], obs_space.high[0], shape=(obs_space.shape[0] + action_space.shape[0], )) - else: - raise NotImplementedError super(DynamicsEnsembleCustomModel, self).__init__( input_space, action_space, num_outputs, model_config, name) diff --git a/rllib/agents/sac/sac_tf_policy.py b/rllib/agents/sac/sac_tf_policy.py index e4cc080afc66..83fa076ed292 100644 --- a/rllib/agents/sac/sac_tf_policy.py +++ b/rllib/agents/sac/sac_tf_policy.py @@ -652,7 +652,7 @@ def validate_spaces(policy: Policy, observation_space: gym.spaces.Space, Raises: UnsupportedSpaceException: If one of the spaces is not supported. """ - # Only support single Box or single Discrete spaces. + # Only support single Box or single Discreete spaces. if not isinstance(action_space, (Box, Discrete, Simplex)): raise UnsupportedSpaceException( "Action space ({}) of {} is not supported for " From 1d60becc8d820492953a0f967ad3ee52b53624c3 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 032/244] Revert "[autoscaler] run setup commands with restart_only=True (#13836)" This reverts commit a5225099d7af1d91b8fe3d59627e7b7ab44c5bc7. 
--- python/ray/autoscaler/_private/commands.py | 10 ++------ python/ray/autoscaler/_private/updater.py | 10 +------- python/ray/tests/test_autoscaler.py | 30 +--------------------- 3 files changed, 4 insertions(+), 46 deletions(-) diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index d967543ff984..336dca40ffd2 100644 --- a/python/ray/autoscaler/_private/commands.py +++ b/python/ray/autoscaler/_private/commands.py @@ -646,12 +646,7 @@ def get_or_create_head_node(config: Dict[str, Any], cli_logger.print("Prepared bootstrap config") if restart_only: - # Docker may re-launch nodes, requiring setup - # commands to be rerun. - if config.get("docker", {}).get("container_name"): - setup_commands = config["head_setup_commands"] - else: - setup_commands = [] + setup_commands = [] ray_start_commands = config["head_start_ray_commands"] elif no_restart: setup_commands = config["head_setup_commands"] @@ -683,8 +678,7 @@ def get_or_create_head_node(config: Dict[str, Any], "rsync_exclude": config.get("rsync_exclude"), "rsync_filter": config.get("rsync_filter") }, - docker_config=config.get("docker"), - restart_only=restart_only) + docker_config=config.get("docker")) updater.start() updater.join() diff --git a/python/ray/autoscaler/_private/updater.py b/python/ray/autoscaler/_private/updater.py index 14981252cd6d..7256d9046f49 100644 --- a/python/ray/autoscaler/_private/updater.py +++ b/python/ray/autoscaler/_private/updater.py @@ -48,7 +48,6 @@ class NodeUpdater: use_internal_ip: Wwhether the node_id belongs to an internal ip or external ip. 
docker_config: Docker section of autoscaler yaml - restart_only: Whether to skip setup commands & just restart ray """ def __init__(self, @@ -69,8 +68,7 @@ def __init__(self, rsync_options=None, process_runner=subprocess, use_internal_ip=False, - docker_config=None, - restart_only=False): + docker_config=None): self.log_prefix = "NodeUpdater: {}: ".format(node_id) use_internal_ip = (use_internal_ip @@ -108,7 +106,6 @@ def __init__(self, self.auth_config = auth_config self.is_head_node = is_head_node self.docker_config = docker_config - self.restart_only = restart_only def run(self): if cmd_output_util.does_allow_interactive( @@ -301,11 +298,6 @@ def do_update(self): sync_run_yet=False) if init_required: node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate" - # This ensures that `setup_commands` are not removed - self.restart_only = False - - if self.restart_only: - self.setup_commands = [] # runtime_hash will only change whenever the user restarts # or updates their cluster with `get_or_create_head_node` diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index 925cb1d202d8..204ed1ef8c9a 100644 --- a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -500,7 +500,7 @@ def testGetOrCreateHeadNodeFromStopped(self): _provider=self.provider, _runner=runner) self.waitForNodes(1) - # Init & Setup commands must be run for Docker! + # Init & Setup commands msut be run for Docker! 
runner.assert_has_call("1.2.3.4", "init_cmd") runner.assert_has_call("1.2.3.4", "head_setup_cmd") runner.assert_has_call("1.2.3.4", "start_ray_head") @@ -543,34 +543,6 @@ def testGetOrCreateHeadNodeFromStopped(self): assert first_mkdir < first_rsync assert first_rsync < first_cp - def testGetOrCreateHeadNodeFromStoppedRestartOnly(self): - self.testGetOrCreateHeadNode() - self.provider.cache_stopped = True - existing_nodes = self.provider.non_terminated_nodes({}) - assert len(existing_nodes) == 1 - self.provider.terminate_node(existing_nodes[0]) - config_path = self.write_config(SMALL_CLUSTER) - runner = MockProcessRunner() - runner.respond_to_call("json .Mounts", ["[]"]) - # Two initial calls to docker cp, + 2 more calls during run_init - runner.respond_to_call(".State.Running", - ["false", "false", "false", "false"]) - runner.respond_to_call("json .Config.Env", ["[]"]) - commands.get_or_create_head_node( - SMALL_CLUSTER, - printable_config_file=config_path, - no_restart=False, - restart_only=True, - yes=True, - override_cluster_name=None, - _provider=self.provider, - _runner=runner) - self.waitForNodes(1) - # Init & Setup commands must be run for Docker! - runner.assert_has_call("1.2.3.4", "init_cmd") - runner.assert_has_call("1.2.3.4", "head_setup_cmd") - runner.assert_has_call("1.2.3.4", "start_ray_head") - @unittest.skipIf(sys.platform == "win32", "Failing on Windows.") def testDockerFileMountsAdded(self): config = copy.deepcopy(SMALL_CLUSTER) From 7d492b3624426f648b3969d43006989e5e5cb869 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 033/244] Revert " Revert "Revert "[Autoscaler] Monitor refactor for backward compatability. (#13970)" (#14046)" (#14050)" This reverts commit fd623f0ec93ce04932548a5435ab5c23e821ba76. 
--- python/ray/monitor.py | 75 +++++-- python/ray/tests/test_multi_node_2.py | 51 +---- python/ray/tests/test_multinode_failures_2.py | 4 +- src/ray/protobuf/common.proto | 34 ++- src/ray/protobuf/gcs.proto | 203 +++++++++--------- src/ray/protobuf/gcs_service.proto | 68 +++--- 6 files changed, 196 insertions(+), 239 deletions(-) diff --git a/python/ray/monitor.py b/python/ray/monitor.py index 30b7f35a578e..fe1edad6380d 100644 --- a/python/ray/monitor.py +++ b/python/ray/monitor.py @@ -8,8 +8,6 @@ import traceback import json -import grpc - import ray from ray.autoscaler._private.autoscaler import StandardAutoscaler from ray.autoscaler._private.commands import teardown_cluster @@ -19,10 +17,11 @@ from ray.autoscaler._private.constants import \ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE from ray.autoscaler._private.util import DEBUG_AUTOSCALING_STATUS - -from ray.core.generated import gcs_service_pb2, gcs_service_pb2_grpc +import ray.gcs_utils +import ray.utils import ray.ray_constants as ray_constants from ray.ray_logging import setup_component_logger +from ray._raylet import GlobalStateAccessor from ray.experimental.internal_kv import _internal_kv_put, \ _internal_kv_initialized, _internal_kv_get @@ -91,17 +90,16 @@ def __init__(self, redis_address, redis_password=redis_password) self.redis = ray._private.services.create_redis_client( redis_address, password=redis_password) - - # Initialize the gcs stub for getting all node resource usage. - gcs_address = self.redis.get("GcsServerAddress").decode("utf-8") - gcs_channel = grpc.insecure_channel(gcs_address) - self.gcs_node_resources_stub = \ - gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel) - + self.global_state_accessor = GlobalStateAccessor( + redis_address, redis_password, False) + self.global_state_accessor.connect() # Set the redis client and mode so _internal_kv works for autoscaler. 
worker = ray.worker.global_worker worker.redis_client = self.redis worker.mode = 0 + # Keep a mapping from raylet client ID to IP address to use + # for updating the load metrics. + self.raylet_id_to_ip_map = {} head_node_ip = redis_address.split(":")[0] self.load_metrics = LoadMetrics(local_ip=head_node_ip) self.last_avail_resources = None @@ -119,14 +117,19 @@ def __init__(self, logger.info("Monitor: Started") + def __del__(self): + """Destruct the monitor object.""" + # We close the pubsub client to avoid leaking file descriptors. + if self.global_state_accessor is not None: + self.global_state_accessor.disconnect() + self.global_state_accessor = None + def update_load_metrics(self): """Fetches resource usage data from GCS and updates load metrics.""" - request = gcs_service_pb2.GetAllResourceUsageRequest() - response = self.gcs_node_resources_stub.GetAllResourceUsage( - request, timeout=4) - resources_batch_data = response.resource_usage_data - + all_resources = self.global_state_accessor.get_all_resource_usage() + resources_batch_data = \ + ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources) for resource_message in resources_batch_data.batch: resource_load = dict(resource_message.resource_load) total_resources = dict(resource_message.resources_total) @@ -138,10 +141,17 @@ def update_load_metrics(self): pending_placement_groups = list( resources_batch_data.placement_group_load.placement_group_data) - ip = resource_message.node_manager_address - self.load_metrics.update( - ip, total_resources, available_resources, resource_load, - waiting_bundles, infeasible_bundles, pending_placement_groups) + # Update the load metrics for this raylet. 
+ node_id = ray.utils.binary_to_hex(resource_message.node_id) + ip = self.raylet_id_to_ip_map.get(node_id) + if ip: + self.load_metrics.update(ip, total_resources, + available_resources, resource_load, + waiting_bundles, infeasible_bundles, + pending_placement_groups) + else: + logger.warning( + f"Monitor: could not find ip for node {node_id}") def update_resource_requests(self): """Fetches resource requests from the internal KV and updates load.""" @@ -156,10 +166,29 @@ def update_resource_requests(self): except Exception: logger.exception("Error parsing resource requests") + def update_raylet_map(self, _append_port=False): + """Updates internal raylet map. + + Args: + _append_port (bool): Defaults to False. Appending the port is + useful in testing, as mock clusters have many nodes with + the same IP and cannot be uniquely identified. + """ + all_raylet_nodes = ray.nodes() + self.raylet_id_to_ip_map = {} + for raylet_info in all_raylet_nodes: + node_id = (raylet_info.get("DBClientID") or raylet_info["NodeID"]) + ip_address = (raylet_info.get("AuxAddress") + or raylet_info["NodeManagerAddress"]).split(":")[0] + if _append_port: + ip_address += ":" + str(raylet_info["NodeManagerPort"]) + self.raylet_id_to_ip_map[node_id] = ip_address + def _run(self): """Run the monitor loop.""" while True: + self.update_raylet_map() self.update_load_metrics() self.update_resource_requests() self.update_event_summary() @@ -335,9 +364,9 @@ def run(self): # Something went wrong, so push an error to all drivers. 
redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) + traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The monitor failed with the " - f"following error:\n{traceback.format_exc()}") - from ray.utils import push_error_to_driver_through_redis - push_error_to_driver_through_redis( + f"following error:\n{traceback_str}") + ray.utils.push_error_to_driver_through_redis( redis_client, ray_constants.MONITOR_DIED_ERROR, message) raise e diff --git a/python/ray/tests/test_multi_node_2.py b/python/ray/tests/test_multi_node_2.py index 7569dff68113..b3e739e643eb 100644 --- a/python/ray/tests/test_multi_node_2.py +++ b/python/ray/tests/test_multi_node_2.py @@ -4,7 +4,6 @@ import ray import ray.ray_constants as ray_constants -from ray.util.placement_group import placement_group, remove_placement_group from ray.autoscaler.sdk import request_resources from ray.monitor import Monitor from ray.cluster_utils import Cluster @@ -69,45 +68,16 @@ def f(): def setup_monitor(address): monitor = Monitor( address, None, redis_password=ray_constants.REDIS_DEFAULT_PASSWORD) + monitor.update_raylet_map(_append_port=True) return monitor -def assert_correct_pg(pg_response_data, pg_demands, strategy): - assert len(pg_response_data) == 1 - pg_response_data = pg_response_data[0] - strategy_mapping_dict_protobuf = { - "PACK": 0, - "SPREAD": 1, - "STRICT_PACK": 2, - "STRICT_SPREAD": 3 - } - assert pg_response_data.strategy == strategy_mapping_dict_protobuf[ - strategy] - assert pg_response_data.creator_job_id - assert pg_response_data.creator_actor_id - assert pg_response_data.creator_actor_dead - assert pg_response_data.placement_group_id - - for i, bundle in enumerate(pg_demands): - assert pg_response_data.bundles[i].unit_resources == bundle - assert pg_response_data.bundles[i].bundle_id.placement_group_id - - -# DO NOT CHANGE THIS VERIFICATION WITHOUT NOTIFYING (Eric/Ameer/Alex). 
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): request_resources(num_cpus=42) - # add placement groups. - pg_demands = [{"GPU": 2}, {"extra_resource": 2}] - strategy = "STRICT_PACK" - pg = placement_group(pg_demands, strategy=strategy) - pg.ready() - time.sleep(2) # wait for placemnt groups to propogate. - # Disable event clearing for test. monitor.event_summarizer.clear = lambda *a: None - visited_atleast_once = [set(), set()] while True: monitor.update_load_metrics() monitor.update_resource_requests() @@ -118,29 +88,21 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): req = monitor.load_metrics.resource_requests assert req == [{"CPU": 1}] * 42, req - pg_response_data = monitor.load_metrics.pending_placement_groups - assert_correct_pg(pg_response_data, pg_demands, strategy) - if "memory" in resource_usage[0]: del resource_usage[0]["memory"] - visited_atleast_once[0].add("memory") - if "object_store_memory" in resource_usage[0]: + if "object_store_memory" in resource_usage[1]: del resource_usage[0]["object_store_memory"] - visited_atleast_once[0].add("object_store_memory") if "memory" in resource_usage[1]: del resource_usage[1]["memory"] - visited_atleast_once[1].add("memory") if "object_store_memory" in resource_usage[1]: del resource_usage[1]["object_store_memory"] - visited_atleast_once[1].add("object_store_memory") for key in list(resource_usage[0].keys()): if key.startswith("node:"): del resource_usage[0][key] - visited_atleast_once[0].add("node:") for key in list(resource_usage[1].keys()): if key.startswith("node:"): del resource_usage[1][key] - visited_atleast_once[1].add("node:") + if expected_resource_usage is None: if all(x for x in resource_usage[0:]): break @@ -158,13 +120,6 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): # Sanity check we emitted a resize event. 
assert any("Resized to" in x for x in monitor.event_summarizer.summary()) - assert visited_atleast_once[0] == { - "memory", "object_store_memory", "node:" - } - assert visited_atleast_once[0] == visited_atleast_once[1] - - remove_placement_group(pg) - return resource_usage diff --git a/python/ray/tests/test_multinode_failures_2.py b/python/ray/tests/test_multinode_failures_2.py index dc8e7465c6ed..3dc65be557c1 100644 --- a/python/ray/tests/test_multinode_failures_2.py +++ b/python/ray/tests/test_multinode_failures_2.py @@ -126,9 +126,7 @@ def test_driver_lives_sequential(ray_start_regular): ray.worker._global_node.kill_raylet() ray.worker._global_node.kill_plasma_store() ray.worker._global_node.kill_log_monitor() - if not sys.platform.startswith("win"): - # fails on windows. - ray.worker._global_node.kill_monitor() + ray.worker._global_node.kill_monitor() ray.worker._global_node.kill_gcs_server() # If the driver can reach the tearDown method, then it is still alive. diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 7178fe7159d8..844f44bea723 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -46,6 +46,19 @@ enum TaskType { DRIVER_TASK = 3; } +// Type of placement group strategy. +enum PlacementStrategy { + // Packs Bundles into as few nodes as possible. + PACK = 0; + // Places Bundles across distinct nodes or processes as even as possible. + SPREAD = 1; + // Packs Bundles within one node. The group is not allowed to span multiple nodes. + STRICT_PACK = 2; + // Places Bundles across distinct nodes. + // The group is not allowed to deploy more than one bundle on a node. + STRICT_SPREAD = 3; +} + // Address of a worker or node manager. message Address { bytes raylet_id = 1; @@ -443,24 +456,3 @@ enum WorkerExitType { // Worker exit due to placement group removal. 
PLACEMENT_GROUP_REMOVED = 3; } -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following enum to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -// Type of placement group strategy. -enum PlacementStrategy { - // Packs Bundles into as few nodes as possible. - PACK = 0; - // Places Bundles across distinct nodes or processes as even as possible. - SPREAD = 1; - // Packs Bundles within one node. The group is not allowed to span multiple nodes. - STRICT_PACK = 2; - // Places Bundles across distinct nodes. - // The group is not allowed to deploy more than one bundle on a node. - STRICT_SPREAD = 3; -} -/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 5da9842f9619..a56bffbe1147 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -158,6 +158,43 @@ message ErrorTableData { double timestamp = 4; } +message PlacementGroupTableData { + // State of a placement group. + enum PlacementGroupState { + // Placement Group is pending or scheduling + PENDING = 0; + // Placement Group is created. + CREATED = 1; + // Placement Group is already removed and won't be reschedule. + REMOVED = 2; + // Placement Group is rescheduling because the node it placed is dead. + RESCHEDULING = 3; + } + + // ID of the PlacementGroup. + bytes placement_group_id = 1; + // The name of the placement group. + string name = 2; + // The array of the bundle in Placement Group. + repeated Bundle bundles = 3; + // The schedule strategy of this Placement Group. 
+ PlacementStrategy strategy = 4; + // Current state of this placement group. + PlacementGroupState state = 5; + // Fields to detect the owner of the placement group + // for automatic lifecycle management. + // The job id that created this placement group. + bytes creator_job_id = 6; + // The actor id that created this placement group. + bytes creator_actor_id = 7; + // Whether or not if the creator job is dead. + bool creator_job_dead = 8; + // Whether or not if the creator actor is dead. + bool creator_actor_dead = 9; + // Whether the placement group is persistent. + bool is_detached = 10; +} + message ScheduleData { map schedule_plan = 1; } @@ -238,11 +275,71 @@ message GcsNodeInfo { int64 timestamp = 10; } +// Represents the demand for a particular resource shape. +message ResourceDemand { + // The resource shape requested. This is a map from the resource string + // (e.g., "CPU") to the amount requested. + map shape = 1; + // The number of requests that are ready to run (i.e., dependencies have been + // fulfilled), but that are waiting for resources. + uint64 num_ready_requests_queued = 2; + // The number of requests for which there is no node that is a superset of + // the requested resource shape. + uint64 num_infeasible_requests_queued = 3; + // The number of requests of this shape still queued in CoreWorkers that this + // raylet knows about. + int64 backlog_size = 4; +} + +// Represents the demand sorted by resource shape. +message ResourceLoad { + // A list of all resource demands. The resource shape in each demand is + // unique. + repeated ResourceDemand resource_demands = 1; +} + +message PlacementGroupLoad { + // The list of pending placement group specifications. + repeated PlacementGroupTableData placement_group_data = 1; +} + message HeartbeatTableData { // Node id. bytes node_id = 1; } +message ResourcesData { + // Node id. + bytes node_id = 1; + // Resource capacity currently available on this node manager. 
+ map resources_available = 2; + // Indicates whether available resources is changed. Only used when light + // heartbeat enabled. + bool resources_available_changed = 3; + // Total resource capacity configured for this node manager. + map resources_total = 4; + // Aggregate outstanding resource load on this node manager. + map resource_load = 5; + // Indicates whether resource load is changed. Only used when + // light heartbeat enabled. + bool resource_load_changed = 6; + // The resource load on this node, sorted by resource shape. + ResourceLoad resource_load_by_shape = 7; + // Whether this node manager is requesting global GC. + bool should_global_gc = 8; + // IP address of the node. + string node_manager_address = 9; +} + +message ResourceUsageBatchData { + repeated ResourcesData batch = 1; + // The total resource demand on all nodes included in the batch, sorted by + // resource shape. + ResourceLoad resource_load_by_shape = 2; + // The pending list of placement groups. + PlacementGroupLoad placement_group_load = 3; +} + // Data for a lease on task execution. message TaskLeaseData { // The task ID. @@ -356,109 +453,3 @@ message PubSubMessage { bytes id = 1; bytes data = 2; } - -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following messages to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -// Represents the demand for a particular resource shape. -message ResourceDemand { - // The resource shape requested. This is a map from the resource string - // (e.g., "CPU") to the amount requested. 
- map shape = 1; - // The number of requests that are ready to run (i.e., dependencies have been - // fulfilled), but that are waiting for resources. - uint64 num_ready_requests_queued = 2; - // The number of requests for which there is no node that is a superset of - // the requested resource shape. - uint64 num_infeasible_requests_queued = 3; - // The number of requests of this shape still queued in CoreWorkers that this - // raylet knows about. - int64 backlog_size = 4; -} - -// Represents the demand sorted by resource shape. -message ResourceLoad { - // A list of all resource demands. The resource shape in each demand is - // unique. - repeated ResourceDemand resource_demands = 1; -} - -message ResourcesData { - // Node id. - bytes node_id = 1; - // Resource capacity currently available on this node manager. - map resources_available = 2; - // Indicates whether available resources is changed. Only used when light - // heartbeat enabled. - bool resources_available_changed = 3; - // Total resource capacity configured for this node manager. - map resources_total = 4; - // Aggregate outstanding resource load on this node manager. - map resource_load = 5; - // Indicates whether resource load is changed. Only used when - // light heartbeat enabled. - bool resource_load_changed = 6; - // The resource load on this node, sorted by resource shape. - ResourceLoad resource_load_by_shape = 7; - // Whether this node manager is requesting global GC. - bool should_global_gc = 8; - // IP address of the node. - string node_manager_address = 9; -} - -message ResourceUsageBatchData { - repeated ResourcesData batch = 1; - // The total resource demand on all nodes included in the batch, sorted by - // resource shape. - ResourceLoad resource_load_by_shape = 2; - // The pending list of placement groups. - PlacementGroupLoad placement_group_load = 3; -} - -message PlacementGroupLoad { - // The list of pending placement group specifications. 
- repeated PlacementGroupTableData placement_group_data = 1; -} - -message PlacementGroupTableData { - // State of a placement group. - enum PlacementGroupState { - // Placement Group is pending or scheduling - PENDING = 0; - // Placement Group is created. - CREATED = 1; - // Placement Group is already removed and won't be reschedule. - REMOVED = 2; - // Placement Group is rescheduling because the node it placed is dead. - RESCHEDULING = 3; - } - - // ID of the PlacementGroup. - bytes placement_group_id = 1; - // The name of the placement group. - string name = 2; - // The array of the bundle in Placement Group. - repeated Bundle bundles = 3; - // The schedule strategy of this Placement Group. - PlacementStrategy strategy = 4; - // Current state of this placement group. - PlacementGroupState state = 5; - // Fields to detect the owner of the placement group - // for automatic lifecycle management. - // The job id that created this placement group. - bytes creator_job_id = 6; - // The actor id that created this placement group. - bytes creator_actor_id = 7; - // Whether or not if the creator job is dead. - bool creator_job_dead = 8; - // Whether or not if the creator actor is dead. - bool creator_actor_dead = 9; - // Whether the placement group is persistent. 
- bool is_detached = 10; -} -/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 41c71c7e05ca..6e2c450dd111 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -19,6 +19,11 @@ package ray.rpc; import "src/ray/protobuf/common.proto"; import "src/ray/protobuf/gcs.proto"; +message GcsStatus { + int32 code = 1; + string message = 2; +} + message AddJobRequest { JobTableData data = 1; } @@ -226,6 +231,31 @@ message ReportResourceUsageReply { GcsStatus status = 1; } +message GetAllResourceUsageRequest { +} + +message GetAllResourceUsageReply { + GcsStatus status = 1; + ResourceUsageBatchData resource_usage_data = 2; +} + +// Service for node resource info access. +service NodeResourceInfoGcsService { + // Get node's resources from GCS Service. + rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); + // Update resources of a node in GCS Service. + rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); + // Delete resources of a node in GCS Service. + rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); + // Get available resources of all nodes. + rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) + returns (GetAllAvailableResourcesReply); + // Report resource usage of a node to GCS Service. + rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); + // Get resource usage of all nodes from GCS Service. + rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); +} + // Service for heartbeat info access. service HeartbeatInfoGcsService { // Report heartbeat of a node to GCS Service. 
@@ -523,41 +553,3 @@ service PlacementGroupInfoGcsService { rpc WaitPlacementGroupUntilReady(WaitPlacementGroupUntilReadyRequest) returns (WaitPlacementGroupUntilReadyReply); } -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following messages to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -message GetAllResourceUsageRequest { -} - -message GetAllResourceUsageReply { - GcsStatus status = 1; - ResourceUsageBatchData resource_usage_data = 2; -} - -// Service for node resource info access. -service NodeResourceInfoGcsService { - // Get node's resources from GCS Service. - rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); - // Update resources of a node in GCS Service. - rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); - // Delete resources of a node in GCS Service. - rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); - // Get available resources of all nodes. - rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) - returns (GetAllAvailableResourcesReply); - // Report resource usage of a node to GCS Service. - rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); - // Get resource usage of all nodes from GCS Service. 
- rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); -} - -message GcsStatus { - int32 code = 1; - string message = 2; -} -/////////////////////////////////////////////////////////////////////////////// From 7d90ec5d2d62060228aba4690d70c3d3272bac3e Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 034/244] Revert "[dask-on-ray] Add better Dask-on-Ray example, and detail custom shuffle optimization. (#13950)" This reverts commit 3dbec8ea94cb69f7b98aaaeda317a18160eaa071. --- doc/source/dask-on-ray.rst | 151 ++++++++++--------------------- doc/source/index.rst | 14 +-- doc/source/memory-management.rst | 1 - 3 files changed, 51 insertions(+), 115 deletions(-) diff --git a/doc/source/dask-on-ray.rst b/doc/source/dask-on-ray.rst index 0530fdc4c7dd..b5383ac8beda 100644 --- a/doc/source/dask-on-ray.rst +++ b/doc/source/dask-on-ray.rst @@ -1,32 +1,22 @@ +*********** Dask on Ray -=========== +*********** -.. _dask-on-ray: +Ray offers a scheduler integration for Dask, allowing you to build data +analyses using the familiar Dask collections (dataframes, arrays) and execute +the underlying computations on a Ray cluster. Using this Dask scheduler, the +entire Dask ecosystem can be executed on top of Ray. -`Dask `__ is a Python parallel computing library geared towards scaling analytics and -scientific computing workloads. It provides `big data collections -`__ that mimic the APIs of -the familiar `NumPy `__ and `Pandas `__ libraries, -allowing those abstractions to represent -larger-than-memory data and/or allowing operations on that data to be run on a multi-machine cluster, -while also providing automatic data parallelism, smart scheduling, -and optimized operations. Operations on these collections create a task graph, which is -executed by a scheduler. 
- -Ray provides a scheduler for Dask (`dask_on_ray`) which allows you to build data -analyses using Dask's collections and execute -the underlying tasks on a Ray cluster. +.. note:: -`dask_on_ray` uses Dask's scheduler API, which allows you to -specify any callable as the scheduler that you would like Dask to use to execute your -workload. Using the Dask-on-Ray scheduler, the entire Dask ecosystem can be executed on top of Ray. + Note that Ray does not currently support object spilling, and hence cannot + process datasets larger than cluster memory. This is a planned feature. +========= Scheduler ---------- +========= -.. _dask-on-ray-scheduler: - -The Dask-on-Ray scheduler can execute any valid Dask graph, and can be used with +The Dask-Ray scheduler can execute any valid Dask graph, and can be used with any Dask `.compute() `__ call. Here's an example: @@ -35,99 +25,53 @@ Here's an example: import ray from ray.util.dask import ray_dask_get - import dask.array as da - import dask.dataframe as dd - import numpy as np - import pandas as pd + import dask.delayed import time # Start Ray. # Tip: If you're connecting to an existing cluster, use ray.init(address="auto"). ray.init() - d_arr = da.from_array(np.random.randint(0, 1000, size=(256, 256))) - # The Dask scheduler submits the underlying task graph to Ray. - d_arr.mean().compute(scheduler=ray_dask_get) + @dask.delayed + def inc(x): + time.sleep(1) + return x + 1 - # Set the scheduler to ray_dask_get in your config so you don't have to specify it on - # each compute call. - dask.config.set(scheduler=ray_dask_get) + @dask.delayed + def add(x, y): + time.sleep(3) + return x + y - df = dd.from_pandas(pd.DataFrame( - np.random.randint(0, 100, size=(1024, 2)), - columns=["age", "grade"])) - df.groupby(["age"]).mean().compute() - - -.. 
note:: - For execution on a Ray cluster, you should *not* use the - `Dask.distributed `__ - client; simply use plain Dask and its collections, and pass ``ray_dask_get`` - to ``.compute()`` calls or set the scheduler in one of the other ways detailed `here `__. Follow the instructions for - :ref:`using Ray on a cluster ` to modify the - ``ray.init()`` call. + x = inc(1) + y = inc(2) + z = add(x, y) + # The Dask scheduler submits the underlying task graph to Ray. + z.compute(scheduler=ray_dask_get) Why use Dask on Ray? -1. To take advantage of Ray-specific features such as the + 1. If you'd like to create data analyses using the familiar NumPy and Pandas + APIs provided by Dask and execute them on a production-ready distributed + task execution system like Ray. + 2. If you'd like to use Dask and Ray libraries in the same application + without having two different task execution backends. + 3. To take advantage of Ray-specific features such as the :ref:`cluster launcher ` and :ref:`shared-memory store `. -2. If you'd like to use Dask and Ray libraries in the same application without having two different clusters. -3. If you'd like to create data analyses using the familiar NumPy and Pandas APIs provided by Dask and execute them on a fast, fault-tolerant distributed task execution system geared towards production, like Ray. - -Dask-on-Ray is an ongoing project and is not expected to achieve the same performance as using Ray directly. All `Dask abstractions `__ should run seamlessly on top of Ray using this scheduler, so if you find that one of these abstractions doesn't run on Ray, please `open an issue `__. - -Out-of-Core Data Processing ---------------------------- - -.. _dask-on-ray-out-of-core: - -Processing datasets larger than cluster memory is supported via Ray's :ref:`object spilling `: if -the in-memory object store is full, objects will be spilled to external storage (local disk by -default). 
This feature is available but off by default in Ray 1.2, and is on by default -in Ray 1.3+. Please see your Ray version's object spilling documentation for steps to enable and/or configure -object spilling. - -Custom optimization for Dask DataFrame shuffling ------------------------------------------------- -.. _dask-on-ray-shuffle-optimization: +Note that for execution on a Ray cluster, you should *not* use the +`Dask.distributed `__ +client; simply use plain Dask and its collections, and pass ``ray_dask_get`` +to ``.compute()`` calls. Follow the instructions for +:ref:`using Ray on a cluster ` to modify the +``ray.init()`` call. -Dask on Ray provides a Dask DataFrame optimizer that leverages Ray's ability to -execute multiple-return tasks in order to speed up shuffling by as much as 4x on Ray. -Simply set the `dataframe_optimize` configuration option to our optimizer function, similar to how you specify the Dask-on-Ray scheduler: - -.. code-block:: python - - import ray - from ray.util.dask import ray_dask_get, dataframe_optimize - import dask.dataframe as dd - import numpy as np - import pandas as pd - import time - - # Start Ray. - # Tip: If you're connecting to an existing cluster, use ray.init(address="auto"). - ray.init() - - # Set the scheduler to ray_dask_get, and set the Dask DataFrame optimizer to our - # custom optimization function, this time using the config setter as a context manager. - with dask.config.set(scheduler=ray_dask_get, dataframe_optimize=dataframe_optimize): - npartitions = 100 - df = dd.from_pandas(pd.DataFrame( - np.random.randint(0, 100, size=(10000, 2)), - columns=["age", "grade"]), npartitions=npartitions) - # We set max_branch to infinity in order to ensure that the task-based shuffle - # happens in a single stage, which is required in order for our optimization to - # work. 
- df.set_index( - ["age"], shuffle="tasks", max_branch=float("inf")).head(10, npartitions=-1) +Dask-on-Ray is an ongoing project and is not expected to achieve the same performance as using Ray directly. +========= Callbacks ---------- - -.. _dask-on-ray-callbacks: +========= Dask's `custom callback abstraction `__ is extended with Ray-specific callbacks, allowing the user to hook into the @@ -264,12 +208,11 @@ execution time exceeds some user-defined threshold: with cache_callback: z.compute(scheduler=ray_dask_get) -.. note:: - The existing Dask scheduler callbacks (``start``, ``start_state``, - ``pretask``, ``posttask``, ``finish``) are also available, which can be used to - introspect the Dask task to Ray task conversion process, but note that the ``pretask`` - and ``posttask`` hooks are executed before and after the Ray task is *submitted*, not - executed, and that ``finish`` is executed after all Ray tasks have been - *submitted*, not executed. +Note that the existing Dask scheduler callbacks (``start``, ``start_state``, +``pretask``, ``posttask``, ``finish``) are also available, which can be used to +introspect the Dask task to Ray task conversion process, but that ``pretask`` +and ``posttask`` are executed before and after the Ray task is *submitted*, not +executed, and that ``finish`` is executed after all Ray tasks have been +*submitted*, not executed. This callback API is currently unstable and subject to change. diff --git a/doc/source/index.rst b/doc/source/index.rst index e90b52299f5a..277c82e55a69 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -296,16 +296,6 @@ Papers raysgd/raysgd_tune.rst raysgd/raysgd_ref.rst -.. toctree:: - :hidden: - :maxdepth: -1 - :caption: Data Processing - - modin/index.rst - dask-on-ray.rst - mars-on-ray.rst - raydp.rst - .. toctree:: :hidden: :maxdepth: -1 @@ -315,6 +305,10 @@ Papers joblib.rst iter.rst xgboost-ray.rst + modin/index.rst + dask-on-ray.rst + mars-on-ray.rst + raydp.rst ray-client.rst .. 
toctree:: diff --git a/doc/source/memory-management.rst b/doc/source/memory-management.rst index f12f7efefd33..8892800a6e94 100644 --- a/doc/source/memory-management.rst +++ b/doc/source/memory-management.rst @@ -179,7 +179,6 @@ In the output of ``ray memory``, we see that the second object displays as a nor Object Spilling --------------- -.. _object-spilling: Ray 1.3+ spills objects to external storage once the object store is full. By default, objects are spilled to the local filesystem. To configure the directory where objects are placed, use: From eda4fdf675d703bbbf7bd696853fc909cdc27ee8 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 035/244] Revert "[docs] Add mode to Ray Tune quick start (#14023)" This reverts commit 9adfcb136d65c8d9a6fa281d2a94dff4ab736300. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index c937160fd836..a69fc92272bd 100644 --- a/README.rst +++ b/README.rst @@ -132,7 +132,7 @@ This example runs a parallel grid search to optimize an example objective functi "beta": tune.choice([1, 2, 3]) }) - print("Best config: ", analysis.get_best_config(metric="mean_loss", mode="min")) + print("Best config: ", analysis.get_best_config(metric="mean_loss")) # Get a dataframe for analyzing trial results. df = analysis.results_df From 65ccfbf53389cb806b6fc06910cba13d5531390f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 036/244] Revert "[doc] Minor fix to indentation (#14040)" This reverts commit 0667a7b2e13f38b257081ce16359045afb5e6bed. 
--- doc/source/walkthrough.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/walkthrough.rst b/doc/source/walkthrough.rst index ec0f0ec3a0f9..77e033a997a1 100644 --- a/doc/source/walkthrough.rst +++ b/doc/source/walkthrough.rst @@ -92,8 +92,8 @@ Ray enables arbitrary functions to be executed asynchronously. These asynchronou @ray.remote def slow_function(): - time.sleep(10) - return 1 + time.sleep(10) + return 1 # Invocations of Ray remote functions happen in parallel. # All computation is performed in the background, driven by Ray's internal event loop. From f9c408c4345a2ab721d210f78f23b38f57db1beb Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 037/244] Revert "Subtract from num bytes in use (#13944)" This reverts commit b059f8394031d049940e4d5685d8fdf05daf1e2e. --- src/ray/object_manager/plasma/store.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index 920ced48e39d..af72192732ec 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -571,10 +571,6 @@ void PlasmaStore::EraseFromObjectTable(const ObjectID &object_id) { if (object->device_num == 0) { PlasmaAllocator::Free(object->pointer, buff_size); } - if (object->ref_count > 0) { - // A client was using this object. - num_bytes_in_use_ -= object->data_size + object->metadata_size; - } store_info_.objects.erase(object_id); } From 1a0d207923657c60f83eb83a7e5ea825a2368758 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 038/244] Revert "Revert "[Autoscaler] Monitor refactor for backward compatability. (#13970)" (#14046)" This reverts commit 8c0270f770270633b4d833a77ec7be66345892a8. 
--- python/ray/monitor.py | 75 +++------- python/ray/tests/test_multi_node_2.py | 51 ++++++- src/ray/protobuf/common.proto | 34 +++-- src/ray/protobuf/gcs.proto | 203 ++++++++++++++------------ src/ray/protobuf/gcs_service.proto | 68 +++++---- 5 files changed, 236 insertions(+), 195 deletions(-) diff --git a/python/ray/monitor.py b/python/ray/monitor.py index fe1edad6380d..72de4e87099b 100644 --- a/python/ray/monitor.py +++ b/python/ray/monitor.py @@ -8,6 +8,8 @@ import traceback import json +import grpc + import ray from ray.autoscaler._private.autoscaler import StandardAutoscaler from ray.autoscaler._private.commands import teardown_cluster @@ -17,11 +19,10 @@ from ray.autoscaler._private.constants import \ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE from ray.autoscaler._private.util import DEBUG_AUTOSCALING_STATUS -import ray.gcs_utils -import ray.utils + +from ray.core.generated import gcs_service_pb2, gcs_service_pb2_grpc import ray.ray_constants as ray_constants from ray.ray_logging import setup_component_logger -from ray._raylet import GlobalStateAccessor from ray.experimental.internal_kv import _internal_kv_put, \ _internal_kv_initialized, _internal_kv_get @@ -90,16 +91,17 @@ def __init__(self, redis_address, redis_password=redis_password) self.redis = ray._private.services.create_redis_client( redis_address, password=redis_password) - self.global_state_accessor = GlobalStateAccessor( - redis_address, redis_password, False) - self.global_state_accessor.connect() + + # Initialize the gcs stub for getting all node resource usage. + gcs_address = self.redis.get("GcsServerAddress").decode("utf-8") + gcs_channel = grpc.insecure_channel(gcs_address) + self.gcs_node_resources_stub = \ + gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel) + # Set the redis client and mode so _internal_kv works for autoscaler. 
worker = ray.worker.global_worker worker.redis_client = self.redis worker.mode = 0 - # Keep a mapping from raylet client ID to IP address to use - # for updating the load metrics. - self.raylet_id_to_ip_map = {} head_node_ip = redis_address.split(":")[0] self.load_metrics = LoadMetrics(local_ip=head_node_ip) self.last_avail_resources = None @@ -117,19 +119,14 @@ def __init__(self, logger.info("Monitor: Started") - def __del__(self): - """Destruct the monitor object.""" - # We close the pubsub client to avoid leaking file descriptors. - if self.global_state_accessor is not None: - self.global_state_accessor.disconnect() - self.global_state_accessor = None - def update_load_metrics(self): """Fetches resource usage data from GCS and updates load metrics.""" - all_resources = self.global_state_accessor.get_all_resource_usage() - resources_batch_data = \ - ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources) + request = gcs_service_pb2.GetAllResourceUsageRequest() + response = self.gcs_node_resources_stub.GetAllResourceUsage( + request, timeout=3) + resources_batch_data = response.resource_usage_data + for resource_message in resources_batch_data.batch: resource_load = dict(resource_message.resource_load) total_resources = dict(resource_message.resources_total) @@ -141,17 +138,10 @@ def update_load_metrics(self): pending_placement_groups = list( resources_batch_data.placement_group_load.placement_group_data) - # Update the load metrics for this raylet. 
- node_id = ray.utils.binary_to_hex(resource_message.node_id) - ip = self.raylet_id_to_ip_map.get(node_id) - if ip: - self.load_metrics.update(ip, total_resources, - available_resources, resource_load, - waiting_bundles, infeasible_bundles, - pending_placement_groups) - else: - logger.warning( - f"Monitor: could not find ip for node {node_id}") + ip = resource_message.node_manager_address + self.load_metrics.update( + ip, total_resources, available_resources, resource_load, + waiting_bundles, infeasible_bundles, pending_placement_groups) def update_resource_requests(self): """Fetches resource requests from the internal KV and updates load.""" @@ -166,29 +156,10 @@ def update_resource_requests(self): except Exception: logger.exception("Error parsing resource requests") - def update_raylet_map(self, _append_port=False): - """Updates internal raylet map. - - Args: - _append_port (bool): Defaults to False. Appending the port is - useful in testing, as mock clusters have many nodes with - the same IP and cannot be uniquely identified. - """ - all_raylet_nodes = ray.nodes() - self.raylet_id_to_ip_map = {} - for raylet_info in all_raylet_nodes: - node_id = (raylet_info.get("DBClientID") or raylet_info["NodeID"]) - ip_address = (raylet_info.get("AuxAddress") - or raylet_info["NodeManagerAddress"]).split(":")[0] - if _append_port: - ip_address += ":" + str(raylet_info["NodeManagerPort"]) - self.raylet_id_to_ip_map[node_id] = ip_address - def _run(self): """Run the monitor loop.""" while True: - self.update_raylet_map() self.update_load_metrics() self.update_resource_requests() self.update_event_summary() @@ -364,9 +335,9 @@ def run(self): # Something went wrong, so push an error to all drivers. 
redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) - traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The monitor failed with the " - f"following error:\n{traceback_str}") - ray.utils.push_error_to_driver_through_redis( + f"following error:\n{traceback.format_exc()}") + from ray.utils import push_error_to_driver_through_redis + push_error_to_driver_through_redis( redis_client, ray_constants.MONITOR_DIED_ERROR, message) raise e diff --git a/python/ray/tests/test_multi_node_2.py b/python/ray/tests/test_multi_node_2.py index b3e739e643eb..7569dff68113 100644 --- a/python/ray/tests/test_multi_node_2.py +++ b/python/ray/tests/test_multi_node_2.py @@ -4,6 +4,7 @@ import ray import ray.ray_constants as ray_constants +from ray.util.placement_group import placement_group, remove_placement_group from ray.autoscaler.sdk import request_resources from ray.monitor import Monitor from ray.cluster_utils import Cluster @@ -68,16 +69,45 @@ def f(): def setup_monitor(address): monitor = Monitor( address, None, redis_password=ray_constants.REDIS_DEFAULT_PASSWORD) - monitor.update_raylet_map(_append_port=True) return monitor +def assert_correct_pg(pg_response_data, pg_demands, strategy): + assert len(pg_response_data) == 1 + pg_response_data = pg_response_data[0] + strategy_mapping_dict_protobuf = { + "PACK": 0, + "SPREAD": 1, + "STRICT_PACK": 2, + "STRICT_SPREAD": 3 + } + assert pg_response_data.strategy == strategy_mapping_dict_protobuf[ + strategy] + assert pg_response_data.creator_job_id + assert pg_response_data.creator_actor_id + assert pg_response_data.creator_actor_dead + assert pg_response_data.placement_group_id + + for i, bundle in enumerate(pg_demands): + assert pg_response_data.bundles[i].unit_resources == bundle + assert pg_response_data.bundles[i].bundle_id.placement_group_id + + +# DO NOT CHANGE THIS VERIFICATION WITHOUT NOTIFYING (Eric/Ameer/Alex). 
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): request_resources(num_cpus=42) + # add placement groups. + pg_demands = [{"GPU": 2}, {"extra_resource": 2}] + strategy = "STRICT_PACK" + pg = placement_group(pg_demands, strategy=strategy) + pg.ready() + time.sleep(2) # wait for placemnt groups to propogate. + # Disable event clearing for test. monitor.event_summarizer.clear = lambda *a: None + visited_atleast_once = [set(), set()] while True: monitor.update_load_metrics() monitor.update_resource_requests() @@ -88,21 +118,29 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): req = monitor.load_metrics.resource_requests assert req == [{"CPU": 1}] * 42, req + pg_response_data = monitor.load_metrics.pending_placement_groups + assert_correct_pg(pg_response_data, pg_demands, strategy) + if "memory" in resource_usage[0]: del resource_usage[0]["memory"] - if "object_store_memory" in resource_usage[1]: + visited_atleast_once[0].add("memory") + if "object_store_memory" in resource_usage[0]: del resource_usage[0]["object_store_memory"] + visited_atleast_once[0].add("object_store_memory") if "memory" in resource_usage[1]: del resource_usage[1]["memory"] + visited_atleast_once[1].add("memory") if "object_store_memory" in resource_usage[1]: del resource_usage[1]["object_store_memory"] + visited_atleast_once[1].add("object_store_memory") for key in list(resource_usage[0].keys()): if key.startswith("node:"): del resource_usage[0][key] + visited_atleast_once[0].add("node:") for key in list(resource_usage[1].keys()): if key.startswith("node:"): del resource_usage[1][key] - + visited_atleast_once[1].add("node:") if expected_resource_usage is None: if all(x for x in resource_usage[0:]): break @@ -120,6 +158,13 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): # Sanity check we emitted a resize event. 
assert any("Resized to" in x for x in monitor.event_summarizer.summary()) + assert visited_atleast_once[0] == { + "memory", "object_store_memory", "node:" + } + assert visited_atleast_once[0] == visited_atleast_once[1] + + remove_placement_group(pg) + return resource_usage diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 844f44bea723..7178fe7159d8 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -46,19 +46,6 @@ enum TaskType { DRIVER_TASK = 3; } -// Type of placement group strategy. -enum PlacementStrategy { - // Packs Bundles into as few nodes as possible. - PACK = 0; - // Places Bundles across distinct nodes or processes as even as possible. - SPREAD = 1; - // Packs Bundles within one node. The group is not allowed to span multiple nodes. - STRICT_PACK = 2; - // Places Bundles across distinct nodes. - // The group is not allowed to deploy more than one bundle on a node. - STRICT_SPREAD = 3; -} - // Address of a worker or node manager. message Address { bytes raylet_id = 1; @@ -456,3 +443,24 @@ enum WorkerExitType { // Worker exit due to placement group removal. PLACEMENT_GROUP_REMOVED = 3; } +/////////////////////////////////////////////////////////////////////////////// +/* Please do not modify/remove/change the following enum to maintain +backwards compatibility in autoscaler. This is necessary to make sure we can +run autoscaler with any version of ray. For example, the K8s operator runs +autoscaler in a separate pod, if the user upgrades the ray version on the head +pod autoscaler can crash (if the newer version of ray modified the messages +below). */ + +// Type of placement group strategy. +enum PlacementStrategy { + // Packs Bundles into as few nodes as possible. + PACK = 0; + // Places Bundles across distinct nodes or processes as even as possible. + SPREAD = 1; + // Packs Bundles within one node. The group is not allowed to span multiple nodes. 
+ STRICT_PACK = 2; + // Places Bundles across distinct nodes. + // The group is not allowed to deploy more than one bundle on a node. + STRICT_SPREAD = 3; +} +/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index a56bffbe1147..5da9842f9619 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -158,43 +158,6 @@ message ErrorTableData { double timestamp = 4; } -message PlacementGroupTableData { - // State of a placement group. - enum PlacementGroupState { - // Placement Group is pending or scheduling - PENDING = 0; - // Placement Group is created. - CREATED = 1; - // Placement Group is already removed and won't be reschedule. - REMOVED = 2; - // Placement Group is rescheduling because the node it placed is dead. - RESCHEDULING = 3; - } - - // ID of the PlacementGroup. - bytes placement_group_id = 1; - // The name of the placement group. - string name = 2; - // The array of the bundle in Placement Group. - repeated Bundle bundles = 3; - // The schedule strategy of this Placement Group. - PlacementStrategy strategy = 4; - // Current state of this placement group. - PlacementGroupState state = 5; - // Fields to detect the owner of the placement group - // for automatic lifecycle management. - // The job id that created this placement group. - bytes creator_job_id = 6; - // The actor id that created this placement group. - bytes creator_actor_id = 7; - // Whether or not if the creator job is dead. - bool creator_job_dead = 8; - // Whether or not if the creator actor is dead. - bool creator_actor_dead = 9; - // Whether the placement group is persistent. - bool is_detached = 10; -} - message ScheduleData { map schedule_plan = 1; } @@ -275,71 +238,11 @@ message GcsNodeInfo { int64 timestamp = 10; } -// Represents the demand for a particular resource shape. -message ResourceDemand { - // The resource shape requested. 
This is a map from the resource string - // (e.g., "CPU") to the amount requested. - map shape = 1; - // The number of requests that are ready to run (i.e., dependencies have been - // fulfilled), but that are waiting for resources. - uint64 num_ready_requests_queued = 2; - // The number of requests for which there is no node that is a superset of - // the requested resource shape. - uint64 num_infeasible_requests_queued = 3; - // The number of requests of this shape still queued in CoreWorkers that this - // raylet knows about. - int64 backlog_size = 4; -} - -// Represents the demand sorted by resource shape. -message ResourceLoad { - // A list of all resource demands. The resource shape in each demand is - // unique. - repeated ResourceDemand resource_demands = 1; -} - -message PlacementGroupLoad { - // The list of pending placement group specifications. - repeated PlacementGroupTableData placement_group_data = 1; -} - message HeartbeatTableData { // Node id. bytes node_id = 1; } -message ResourcesData { - // Node id. - bytes node_id = 1; - // Resource capacity currently available on this node manager. - map resources_available = 2; - // Indicates whether available resources is changed. Only used when light - // heartbeat enabled. - bool resources_available_changed = 3; - // Total resource capacity configured for this node manager. - map resources_total = 4; - // Aggregate outstanding resource load on this node manager. - map resource_load = 5; - // Indicates whether resource load is changed. Only used when - // light heartbeat enabled. - bool resource_load_changed = 6; - // The resource load on this node, sorted by resource shape. - ResourceLoad resource_load_by_shape = 7; - // Whether this node manager is requesting global GC. - bool should_global_gc = 8; - // IP address of the node. 
- string node_manager_address = 9; -} - -message ResourceUsageBatchData { - repeated ResourcesData batch = 1; - // The total resource demand on all nodes included in the batch, sorted by - // resource shape. - ResourceLoad resource_load_by_shape = 2; - // The pending list of placement groups. - PlacementGroupLoad placement_group_load = 3; -} - // Data for a lease on task execution. message TaskLeaseData { // The task ID. @@ -453,3 +356,109 @@ message PubSubMessage { bytes id = 1; bytes data = 2; } + +/////////////////////////////////////////////////////////////////////////////// +/* Please do not modify/remove/change the following messages to maintain +backwards compatibility in autoscaler. This is necessary to make sure we can +run autoscaler with any version of ray. For example, the K8s operator runs +autoscaler in a separate pod, if the user upgrades the ray version on the head +pod autoscaler can crash (if the newer version of ray modified the messages +below). */ + +// Represents the demand for a particular resource shape. +message ResourceDemand { + // The resource shape requested. This is a map from the resource string + // (e.g., "CPU") to the amount requested. + map shape = 1; + // The number of requests that are ready to run (i.e., dependencies have been + // fulfilled), but that are waiting for resources. + uint64 num_ready_requests_queued = 2; + // The number of requests for which there is no node that is a superset of + // the requested resource shape. + uint64 num_infeasible_requests_queued = 3; + // The number of requests of this shape still queued in CoreWorkers that this + // raylet knows about. + int64 backlog_size = 4; +} + +// Represents the demand sorted by resource shape. +message ResourceLoad { + // A list of all resource demands. The resource shape in each demand is + // unique. + repeated ResourceDemand resource_demands = 1; +} + +message ResourcesData { + // Node id. 
+ bytes node_id = 1; + // Resource capacity currently available on this node manager. + map resources_available = 2; + // Indicates whether available resources is changed. Only used when light + // heartbeat enabled. + bool resources_available_changed = 3; + // Total resource capacity configured for this node manager. + map resources_total = 4; + // Aggregate outstanding resource load on this node manager. + map resource_load = 5; + // Indicates whether resource load is changed. Only used when + // light heartbeat enabled. + bool resource_load_changed = 6; + // The resource load on this node, sorted by resource shape. + ResourceLoad resource_load_by_shape = 7; + // Whether this node manager is requesting global GC. + bool should_global_gc = 8; + // IP address of the node. + string node_manager_address = 9; +} + +message ResourceUsageBatchData { + repeated ResourcesData batch = 1; + // The total resource demand on all nodes included in the batch, sorted by + // resource shape. + ResourceLoad resource_load_by_shape = 2; + // The pending list of placement groups. + PlacementGroupLoad placement_group_load = 3; +} + +message PlacementGroupLoad { + // The list of pending placement group specifications. + repeated PlacementGroupTableData placement_group_data = 1; +} + +message PlacementGroupTableData { + // State of a placement group. + enum PlacementGroupState { + // Placement Group is pending or scheduling + PENDING = 0; + // Placement Group is created. + CREATED = 1; + // Placement Group is already removed and won't be reschedule. + REMOVED = 2; + // Placement Group is rescheduling because the node it placed is dead. + RESCHEDULING = 3; + } + + // ID of the PlacementGroup. + bytes placement_group_id = 1; + // The name of the placement group. + string name = 2; + // The array of the bundle in Placement Group. + repeated Bundle bundles = 3; + // The schedule strategy of this Placement Group. + PlacementStrategy strategy = 4; + // Current state of this placement group. 
+ PlacementGroupState state = 5; + // Fields to detect the owner of the placement group + // for automatic lifecycle management. + // The job id that created this placement group. + bytes creator_job_id = 6; + // The actor id that created this placement group. + bytes creator_actor_id = 7; + // Whether or not if the creator job is dead. + bool creator_job_dead = 8; + // Whether or not if the creator actor is dead. + bool creator_actor_dead = 9; + // Whether the placement group is persistent. + bool is_detached = 10; +} +/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 6e2c450dd111..41c71c7e05ca 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -19,11 +19,6 @@ package ray.rpc; import "src/ray/protobuf/common.proto"; import "src/ray/protobuf/gcs.proto"; -message GcsStatus { - int32 code = 1; - string message = 2; -} - message AddJobRequest { JobTableData data = 1; } @@ -231,31 +226,6 @@ message ReportResourceUsageReply { GcsStatus status = 1; } -message GetAllResourceUsageRequest { -} - -message GetAllResourceUsageReply { - GcsStatus status = 1; - ResourceUsageBatchData resource_usage_data = 2; -} - -// Service for node resource info access. -service NodeResourceInfoGcsService { - // Get node's resources from GCS Service. - rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); - // Update resources of a node in GCS Service. - rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); - // Delete resources of a node in GCS Service. - rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); - // Get available resources of all nodes. - rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) - returns (GetAllAvailableResourcesReply); - // Report resource usage of a node to GCS Service. 
- rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); - // Get resource usage of all nodes from GCS Service. - rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); -} - // Service for heartbeat info access. service HeartbeatInfoGcsService { // Report heartbeat of a node to GCS Service. @@ -553,3 +523,41 @@ service PlacementGroupInfoGcsService { rpc WaitPlacementGroupUntilReady(WaitPlacementGroupUntilReadyRequest) returns (WaitPlacementGroupUntilReadyReply); } +/////////////////////////////////////////////////////////////////////////////// +/* Please do not modify/remove/change the following messages to maintain +backwards compatibility in autoscaler. This is necessary to make sure we can +run autoscaler with any version of ray. For example, the K8s operator runs +autoscaler in a separate pod, if the user upgrades the ray version on the head +pod autoscaler can crash (if the newer version of ray modified the messages +below). */ + +message GetAllResourceUsageRequest { +} + +message GetAllResourceUsageReply { + GcsStatus status = 1; + ResourceUsageBatchData resource_usage_data = 2; +} + +// Service for node resource info access. +service NodeResourceInfoGcsService { + // Get node's resources from GCS Service. + rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); + // Update resources of a node in GCS Service. + rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); + // Delete resources of a node in GCS Service. + rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); + // Get available resources of all nodes. + rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) + returns (GetAllAvailableResourcesReply); + // Report resource usage of a node to GCS Service. + rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); + // Get resource usage of all nodes from GCS Service. 
+ rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); +} + +message GcsStatus { + int32 code = 1; + string message = 2; +} +/////////////////////////////////////////////////////////////////////////////// From 2549efe3ca7682fe8000cd1d04343a05338b995b Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 039/244] Revert "[hotfix][docs] RayDP tensorflow != pytorch (#14044)" This reverts commit 10928b33bea5542fad21e3e226b2f03f8ecfe74f. --- doc/source/raydp.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/raydp.rst b/doc/source/raydp.rst index 9a8353ccc9f1..cee14234439c 100644 --- a/doc/source/raydp.rst +++ b/doc/source/raydp.rst @@ -75,9 +75,9 @@ Training a Spark DataFrame with TensorFlow tensorflow_model = estimator.get_model() -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Training a Spark DataFrame with PyTorch -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Training a Spark DataFrame with TensorFlow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Similarly, ``raydp.torch.TorchEstimator`` provides an API for training with PyTorch. From dde0134d3b4efcf15f27c1db5d95ae6db4c77b14 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 040/244] Revert "[tune] add scalability release tests (#13986)" This reverts commit 3c321c47da1edf5da5db2df8aa201e9594337918. 
--- release/RELEASE_CHECKLIST.md | 10 +- release/RELEASE_PROCESS.rst | 10 +- .../tune_tests/scalability_tests/cluster.yaml | 31 ++++ .../scalability_tests/cluster_16x2.yaml | 47 ------ .../scalability_tests/cluster_16x64.yaml | 42 ----- .../scalability_tests/cluster_16x64_data.yaml | 53 ------ .../scalability_tests/cluster_1x16.yaml | 34 ---- .../scalability_tests/cluster_1x32_hd.yaml | 40 ----- .../scalability_tests/cluster_1x96.yaml | 34 ---- .../scalability_tests/cluster_200x2.yaml | 42 ----- .../scalability_tests/create_test_data.py | 61 ------- release/tune_tests/scalability_tests/run.sh | 22 +-- .../scalability_tests/workloads/_trainable.py | 153 ------------------ .../workloads/test_bookkeeping_overhead.py | 42 ----- .../workloads/test_durable_trainable.py | 47 ------ .../test_long_running_large_checkpoints.py | 44 ----- .../workloads/test_network_overhead.py | 41 ----- .../workloads/test_result_buffering.py | 54 +++++++ .../test_result_throughput_cluster.py | 49 ------ .../test_result_throughput_single_node.py | 42 ----- .../workloads/test_xgboost_sweep.py | 98 ----------- 21 files changed, 100 insertions(+), 896 deletions(-) create mode 100644 release/tune_tests/scalability_tests/cluster.yaml delete mode 100644 release/tune_tests/scalability_tests/cluster_16x2.yaml delete mode 100644 release/tune_tests/scalability_tests/cluster_16x64.yaml delete mode 100644 release/tune_tests/scalability_tests/cluster_16x64_data.yaml delete mode 100644 release/tune_tests/scalability_tests/cluster_1x16.yaml delete mode 100644 release/tune_tests/scalability_tests/cluster_1x32_hd.yaml delete mode 100644 release/tune_tests/scalability_tests/cluster_1x96.yaml delete mode 100644 release/tune_tests/scalability_tests/cluster_200x2.yaml delete mode 100644 release/tune_tests/scalability_tests/create_test_data.py delete mode 100644 release/tune_tests/scalability_tests/workloads/_trainable.py delete mode 100644 
release/tune_tests/scalability_tests/workloads/test_bookkeeping_overhead.py delete mode 100644 release/tune_tests/scalability_tests/workloads/test_durable_trainable.py delete mode 100644 release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py delete mode 100644 release/tune_tests/scalability_tests/workloads/test_network_overhead.py create mode 100644 release/tune_tests/scalability_tests/workloads/test_result_buffering.py delete mode 100644 release/tune_tests/scalability_tests/workloads/test_result_throughput_cluster.py delete mode 100644 release/tune_tests/scalability_tests/workloads/test_result_throughput_single_node.py delete mode 100644 release/tune_tests/scalability_tests/workloads/test_xgboost_sweep.py diff --git a/release/RELEASE_CHECKLIST.md b/release/RELEASE_CHECKLIST.md index 0c742a94d19f..da2d9145a825 100644 --- a/release/RELEASE_CHECKLIST.md +++ b/release/RELEASE_CHECKLIST.md @@ -62,14 +62,8 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] K8s operator test - [ ] Data processing tests - [ ] streaming_shuffle -- [ ] Tune tests - - [ ] test_bookkeeping_overhead - - [x] test_result_throughput_cluster (ignore final time) - - [x] test_result_throughput_single_node (ignore final time) - - [x] test_network_overhead (ignore final time) - - [ ] test_long_running_large_checkpoints - - [ ] test_xgboost_sweep - - [ ] test_durable_trainable +- [x] Tune tests + - [x] ignore for now - [ ] XGBoost Tests - [ ] distributed_api_test - [ ] train_small diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index 2502a08657ca..f1decb4b6f99 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -167,14 +167,8 @@ is generally the easiest way to run release tests. General Ray Tune functionality is implicitly tested via RLLib and XGBoost release tests. We are in the process of introducing scalability envelopes for Ray Tune. 
- - Of the seven existing tests, three are currently not reaching their target time. - These three tests (test_result_throughput_cluster, test_result_throughput_single_node, and - test_network_overhead) are marked in the release checklist and don't have to be run at this time. - - The other release tests are expected to run through without errors and to pass within a pre-specified time. - The time is checked in the test function and the output will let you know if a run was fast enough and - thus passed the test. + This is an ongoing effort and will only be introduced in the next release. + For now, **you can ignore the tune_tests directory**. 10. **XGBoost release tests** diff --git a/release/tune_tests/scalability_tests/cluster.yaml b/release/tune_tests/scalability_tests/cluster.yaml new file mode 100644 index 000000000000..fd966898b8a7 --- /dev/null +++ b/release/tune_tests/scalability_tests/cluster.yaml @@ -0,0 +1,31 @@ +cluster_name: ray-tune-scalability-tests + +min_workers: 15 +max_workers: 15 + +idle_timeout_minutes: 15 + +docker: + image: anyscale/ray:nightly + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +auth: + ssh_user: ubuntu + +head_node: + # 64 CPUs + InstanceType: m5.16xlarge + +worker_nodes: + # 64 CPUs + InstanceType: m5.16xlarge + +setup_commands: + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl diff --git a/release/tune_tests/scalability_tests/cluster_16x2.yaml b/release/tune_tests/scalability_tests/cluster_16x2.yaml deleted file mode 100644 index e5e56e7c957d..000000000000 --- a/release/tune_tests/scalability_tests/cluster_16x2.yaml +++ /dev/null @@ -1,47 +0,0 @@ -cluster_name: ray-tune-scalability-tests-16x2 - -max_workers: 15 -upscaling_speed: 15 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - 
pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_2_ondemand: - node_config: - InstanceType: m5.large - resources: {"CPU": 2} - min_workers: 0 - max_workers: 0 - cpu_2_spot: - node_config: - InstanceType: m5.large - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 2} - min_workers: 15 - max_workers: 15 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_2_ondemand -worker_default_node_type: cpu_2_spot - -setup_commands: - - ray install-nightly - - pip install -U awscli - -file_mounts: { - "~/release-automation-tune_scalability_tests": "." -} diff --git a/release/tune_tests/scalability_tests/cluster_16x64.yaml b/release/tune_tests/scalability_tests/cluster_16x64.yaml deleted file mode 100644 index fbe954b6c789..000000000000 --- a/release/tune_tests/scalability_tests/cluster_16x64.yaml +++ /dev/null @@ -1,42 +0,0 @@ -cluster_name: ray-tune-scalability-tests-16x64 - -max_workers: 15 -upscaling_speed: 15 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_64_ondemand: - node_config: - InstanceType: m5.16xlarge - resources: {"CPU": 64} - min_workers: 0 - max_workers: 0 - cpu_64_spot: - node_config: - InstanceType: m5.16xlarge - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 64} - min_workers: 15 - max_workers: 15 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_64_ondemand -worker_default_node_type: cpu_64_spot - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_16x64_data.yaml b/release/tune_tests/scalability_tests/cluster_16x64_data.yaml deleted file mode 100644 index 56db5a349065..000000000000 --- a/release/tune_tests/scalability_tests/cluster_16x64_data.yaml +++ /dev/null @@ 
-1,53 +0,0 @@ -cluster_name: ray-tune-scalability-tests-16x64_data - -max_workers: 16 -upscaling_speed: 16 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_64_ondemand: - node_config: - InstanceType: m5.16xlarge - resources: {"CPU": 64} - min_workers: 0 - max_workers: 0 - cpu_64_spot: - node_config: - InstanceType: m5.16xlarge - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 64} - min_workers: 15 - max_workers: 15 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_64_ondemand -worker_default_node_type: cpu_64_spot - -file_mounts: { - "~/release-automation-tune_scalability_tests": "." -} - -setup_commands: - - ray install-nightly - - pip install pytest xgboost_ray - - mkdir -p ~/data || true - - rm -rf ~/data/train.parquet || true - - rm -rf ~/data/test.parquet || true - - cp -R /tmp/ray_tmp_mount/release-automation-tune_scalability_tests ~/release-automation-tune_scalability_tests || echo "Copy failed" - - python ~/release-automation-tune_scalability_tests/create_test_data.py ~/data/train.parquet --seed 1234 --num-rows 40000000 --num-cols 40 --num-partitions 128 --num-classes 2 - - python ~/release-automation-tune_scalability_tests/create_test_data.py ~/data/test.parquet --seed 1234 --num-rows 10000000 --num-cols 40 --num-partitions 128 --num-classes 2 diff --git a/release/tune_tests/scalability_tests/cluster_1x16.yaml b/release/tune_tests/scalability_tests/cluster_1x16.yaml deleted file mode 100644 index a40e0d0a0711..000000000000 --- a/release/tune_tests/scalability_tests/cluster_1x16.yaml +++ /dev/null @@ -1,34 +0,0 @@ -cluster_name: ray-tune-scalability-tests-1x16 - -max_workers: 0 -upscaling_speed: 1 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - 
type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_4_ondemand: - node_config: - InstanceType: m5.xlarge - resources: {"CPU": 4} - min_workers: 0 - max_workers: 0 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_4_ondemand -worker_default_node_type: cpu_4_ondemand - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_1x32_hd.yaml b/release/tune_tests/scalability_tests/cluster_1x32_hd.yaml deleted file mode 100644 index e909c138c90b..000000000000 --- a/release/tune_tests/scalability_tests/cluster_1x32_hd.yaml +++ /dev/null @@ -1,40 +0,0 @@ -cluster_name: ray-tune-scalability-tests-1x32_hd - -max_workers: 0 -upscaling_speed: 1 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_32_hd_ondemand: - node_config: - InstanceType: m5.8xlarge - - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 160 - - resources: {"CPU": 32} # 128 GB memory - min_workers: 0 - max_workers: 0 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_32_hd_ondemand -worker_default_node_type: cpu_32_hd_ondemand - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_1x96.yaml b/release/tune_tests/scalability_tests/cluster_1x96.yaml deleted file mode 100644 index ec01ede17926..000000000000 --- a/release/tune_tests/scalability_tests/cluster_1x96.yaml +++ /dev/null @@ -1,34 +0,0 @@ -cluster_name: ray-tune-scalability-tests-1x96 - -max_workers: 0 -upscaling_speed: 1 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - 
cpu_96_ondemand: - node_config: - InstanceType: m5.24xlarge - resources: {"CPU": 96} - min_workers: 0 - max_workers: 0 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_96_ondemand -worker_default_node_type: cpu_96_ondemand - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_200x2.yaml b/release/tune_tests/scalability_tests/cluster_200x2.yaml deleted file mode 100644 index 143505ab2d14..000000000000 --- a/release/tune_tests/scalability_tests/cluster_200x2.yaml +++ /dev/null @@ -1,42 +0,0 @@ -cluster_name: ray-tune-scalability-tests-200x2 - -max_workers: 199 -upscaling_speed: 199 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_2_ondemand: - node_config: - InstanceType: m5.large - resources: {"CPU": 2} - min_workers: 0 - max_workers: 0 - cpu_2_spot: - node_config: - InstanceType: m5.large - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 2} - min_workers: 199 - max_workers: 199 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_2_ondemand -worker_default_node_type: cpu_2_spot - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/create_test_data.py b/release/tune_tests/scalability_tests/create_test_data.py deleted file mode 100644 index f7a450105426..000000000000 --- a/release/tune_tests/scalability_tests/create_test_data.py +++ /dev/null @@ -1,61 +0,0 @@ -import argparse -import numpy as np -import os - -from xgboost_ray.tests.utils import create_parquet - -if __name__ == "__main__": - if "OMP_NUM_THREADS" in os.environ: - del os.environ["OMP_NUM_THREADS"] - - parser = argparse.ArgumentParser(description="Create fake data.") - parser.add_argument( - "filename", type=str, default="/data/parted.parquet/", help="ray/dask") - parser.add_argument( - "-r", - 
"--num-rows", - required=False, - type=int, - default=1e8, - help="num rows") - parser.add_argument( - "-p", - "--num-partitions", - required=False, - type=int, - default=100, - help="num partitions") - parser.add_argument( - "-c", - "--num-cols", - required=False, - type=int, - default=4, - help="num columns (features)") - parser.add_argument( - "-C", - "--num-classes", - required=False, - type=int, - default=2, - help="num classes") - parser.add_argument( - "-s", - "--seed", - required=False, - type=int, - default=1234, - help="random seed") - - args = parser.parse_args() - - if os.path.exists(args.filename): - print(f"File already exists: {args.filename}. Skipping creation.") - - np.random.seed(args.seed) - create_parquet( - args.filename, - num_rows=int(args.num_rows), - num_partitions=int(args.num_partitions), - num_features=int(args.num_cols), - num_classes=int(args.num_classes)) diff --git a/release/tune_tests/scalability_tests/run.sh b/release/tune_tests/scalability_tests/run.sh index 6c7172bfcc00..e4f5698aa6a9 100755 --- a/release/tune_tests/scalability_tests/run.sh +++ b/release/tune_tests/scalability_tests/run.sh @@ -1,7 +1,6 @@ #!/usr/bin/env bash -nodes="" -ray_version="" +ray_version="" commit="" ray_branch="" @@ -9,11 +8,9 @@ for i in "$@" do echo "$i" case "$i" in - --nodes=*) - nodes="${i#*=}" - ;; --ray-version=*) ray_version="${i#*=}" + ;; --commit=*) commit="${i#*=}" @@ -35,22 +32,25 @@ case "$i" in esac done -if [[ $nodes == "" || $ray_version == "" || $commit == "" || $ray_branch == "" ]] +if [[ $ray_version == "" || $commit == "" || $ray_branch == "" ]] then - echo "Provide --nodes --ray-version, --commit, and --ray-branch" + echo "Provide --ray-version, --commit, and --ray-branch" exit 1 fi -echo "nodes: $nodes" echo "version: $ray_version" echo "commit: $commit" echo "branch: $ray_branch" echo "workload: ignored" -# wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" 
-# pip install -U "$wheel" +wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" + +pip install -U pip +pip install -U "$wheel" +pip install "ray[tune]" "ray" +pip install boto3==1.4.8 cython==0.29.0 -if ! python "wait_cluster.py" "$nodes" 600; then +if ! python "wait_cluster.py" 16 450; then echo "Cluster did not come up in time. Aborting test." exit 1 fi diff --git a/release/tune_tests/scalability_tests/workloads/_trainable.py b/release/tune_tests/scalability_tests/workloads/_trainable.py deleted file mode 100644 index c5ce8c005f79..000000000000 --- a/release/tune_tests/scalability_tests/workloads/_trainable.py +++ /dev/null @@ -1,153 +0,0 @@ -import os -import time - -import numpy as np -import pickle - -from ray import tune - -from ray.tune.durable_trainable import DurableTrainable - - -class TestDurableTrainable(DurableTrainable): - def __init__(self, remote_checkpoint_dir, config, logger_creator=None): - self.setup_env() - - super(TestDurableTrainable, self).__init__( - remote_checkpoint_dir, - config=config, - logger_creator=logger_creator) - - def setup_env(self): - pass - - def setup(self, config): - self._num_iters = int(config["num_iters"]) - self._sleep_time = config["sleep_time"] - self._score = config["score"] - - self._checkpoint_iters = config["checkpoint_iters"] - self._checkpoint_size_b = config["checkpoint_size_b"] - self._checkpoint_num_items = self._checkpoint_size_b // 8 # np.float64 - - self._iter = 0 - - def step(self): - if self._iter > 0: - time.sleep(self._sleep_time) - - res = dict(score=self._iter + self._score) - - if self._iter >= self._num_iters: - res["done"] = True - - self._iter += 1 - return res - - def save_checkpoint(self, tmp_checkpoint_dir): - checkpoint_file = os.path.join(tmp_checkpoint_dir, "bogus.ckpt") - checkpoint_data = np.random.uniform( - 0, 1, size=self._checkpoint_num_items) - with open(checkpoint_file, "wb") as fp: - 
pickle.dump(checkpoint_data, fp) - return checkpoint_file - - def load_checkpoint(self, checkpoint): - pass - - -def function_trainable(config): - num_iters = int(config["num_iters"]) - sleep_time = config["sleep_time"] - score = config["score"] - - checkpoint_iters = config["checkpoint_iters"] - checkpoint_size_b = config["checkpoint_size_b"] - checkpoint_num_items = checkpoint_size_b // 8 # np.float64 - - for i in range(num_iters): - if checkpoint_iters >= 0 and checkpoint_size_b > 0 and \ - i % checkpoint_iters == 0: - with tune.checkpoint_dir(step=i) as dir: - checkpoint_file = os.path.join(dir, "bogus.ckpt") - checkpoint_data = np.random.uniform( - 0, 1, size=checkpoint_num_items) - with open(checkpoint_file, "wb") as fp: - pickle.dump(checkpoint_data, fp) - - tune.report(score=i + score) - time.sleep(sleep_time) - - -def timed_tune_run(name: str, - num_samples: int, - results_per_second: int = 1, - trial_length_s: int = 1, - max_runtime: int = 300, - checkpoint_freq_s: int = -1, - checkpoint_size_b: int = 0, - **tune_kwargs): - durable = "sync_config" in tune_kwargs and \ - tune_kwargs["sync_config"].upload_dir.startswith("s3://") - - sleep_time = 1. 
/ results_per_second - num_iters = int(trial_length_s / sleep_time) - checkpoint_iters = -1 - if checkpoint_freq_s >= 0: - checkpoint_iters = int(checkpoint_freq_s / sleep_time) - - config = { - "score": tune.uniform(0., 1.), - "num_iters": num_iters, - "sleep_time": sleep_time, - "checkpoint_iters": checkpoint_iters, - "checkpoint_size_b": checkpoint_size_b, - } - - print(f"Starting benchmark with config: {config}") - - run_kwargs = {"reuse_actors": True, "verbose": 2} - run_kwargs.update(tune_kwargs) - - _train = function_trainable - - aws_key_id = os.getenv("AWS_ACCESS_KEY_ID", "") - aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY", "") - aws_session = os.getenv("AWS_SESSION_TOKEN", "") - - if durable: - - class AwsDurableTrainable(TestDurableTrainable): - AWS_ACCESS_KEY_ID = aws_key_id - AWS_SECRET_ACCESS_KEY = aws_secret - AWS_SESSION_TOKEN = aws_session - - def setup_env(self): - os.environ["AWS_ACCESS_KEY_ID"] = self.AWS_ACCESS_KEY_ID - os.environ[ - "AWS_SECRET_ACCESS_KEY"] = self.AWS_SECRET_ACCESS_KEY - os.environ["AWS_SESSION_TOKEN"] = self.AWS_SESSION_TOKEN - - _train = AwsDurableTrainable - run_kwargs["checkpoint_freq"] = checkpoint_iters - - start_time = time.monotonic() - tune.run( - _train, - config=config, - num_samples=num_samples, - raise_on_failed_trial=False, - **run_kwargs) - time_taken = time.monotonic() - start_time - - assert time_taken < max_runtime, \ - f"The {name} test took {time_taken:.2f} seconds, but should not " \ - f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n" \ - f"--- FAILED: {name.upper()} ::: " \ - f"{time_taken:.2f} > {max_runtime:.2f} ---" - - print(f"The {name} test took {time_taken:.2f} seconds, which " - f"is below the budget of {max_runtime:.2f} seconds. " - f"Test successful. 
\n\n" - f"--- PASSED: {name.upper()} ::: " - f"{time_taken:.2f} <= {max_runtime:.2f} ---") diff --git a/release/tune_tests/scalability_tests/workloads/test_bookkeeping_overhead.py b/release/tune_tests/scalability_tests/workloads/test_bookkeeping_overhead.py deleted file mode 100644 index 2792c18d8830..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_bookkeeping_overhead.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Bookkeeping overhead (1 node, 10k trials) - -In this run, we will start a large number of trials (10k) that take just a -second to run. We thus measure overhead that comes with dealing with a -large number of trials, e.g. experiment checkpointing. - -Cluster: cluster_1x16.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 800 seconds. - -Theoretical minimum time: 10000/16 = 625 seconds -""" -import os - -import ray - -from _trainable import timed_tune_run - - -def main(): - os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "100" # Tweak - - ray.init(address="auto") - - num_samples = 10000 - results_per_second = 1 - trial_length_s = 1 - - max_runtime = 800 - - timed_tune_run( - name="bookkeeping overhead", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py b/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py deleted file mode 100644 index b37fd596f6fe..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Durable trainable (16 trials, checkpoint to cloud) - -In this run, we will start 16 trials on a cluster. The trials create -10 MB checkpoints every 10 seconds and should only keep 2 of these. This test -ensures that durable checkpoints don't slow down experiment progress too much. 
- -Cluster: cluster_16x2.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 500 seconds. - -Theoretical minimum time: 300 seconds -""" -import ray -from ray import tune - -from _trainable import timed_tune_run - - -def main(): - ray.init(address="auto") - - num_samples = 16 - results_per_second = 10 / 60 - trial_length_s = 300 - - max_runtime = 500 - - timed_tune_run( - name="durable trainable", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - checkpoint_freq_s=10, # Once every 10 seconds - checkpoint_size_b=int(10 * 1000**2), # 10 MB - keep_checkpoints_num=2, - resources_per_trial={"cpu": 2}, - sync_config=tune.SyncConfig( - sync_to_driver=False, - upload_dir="s3://ray-tune-scalability-test/durable/", - )) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py b/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py deleted file mode 100644 index 05484431c700..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Large checkpoints in long running trials (16 trials, 4 GB checkpoints). - -In this run, we will start 16 trials on a single node. The trials create -4 GB checkpoints every 15 minutes and should only keep 2 of these. This test -ensures that handling large checkpoints don't lead to much overhead. - -Cluster: cluster_1x32_hd.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 90,000 seconds. 
- -Theoretical minimum time: 86,400 seconds -""" -import ray -from ray import tune - -from _trainable import timed_tune_run - - -def main(): - ray.init(address="auto") - - num_samples = 16 - results_per_second = 1 / 60 - trial_length_s = 86400 - - max_runtime = 90000 - - timed_tune_run( - name="long running large checkpoints", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - checkpoint_freq_s=900, # Once every 15 minutes - checkpoint_size_b=int(3.75 * 1000**3), - keep_checkpoints_num=2, # 2 * 16 * 4 = 128 GB - resources_per_trial={"cpu": 1}, - sync_config=tune.SyncConfig(sync_to_driver=True)) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py deleted file mode 100644 index 3222b6eca97d..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Networking overhead (200 trials on 200 nodes) - -In this run, we will start 200 trials and run them on 200 different nodes. -This test will thus measure the overhead that comes with network communication -and specifically log synchronization. - -Cluster: cluster_200x2.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 500 seconds. 
- -Theoretical minimum time: 300 seconds -""" -import ray -from ray import tune - -from _trainable import timed_tune_run - - -def main(): - ray.init(address="auto") - - num_samples = 200 - results_per_second = 1 - trial_length_s = 300 - - max_runtime = 500 - - timed_tune_run( - name="result network overhead", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - resources_per_trial={"cpu": 2}, # One per node - sync_config=tune.SyncConfig(sync_to_driver=True)) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_result_buffering.py b/release/tune_tests/scalability_tests/workloads/test_result_buffering.py new file mode 100644 index 000000000000..e6ea1762f9b2 --- /dev/null +++ b/release/tune_tests/scalability_tests/workloads/test_result_buffering.py @@ -0,0 +1,54 @@ +import time + +import ray +from ray import tune +from ray.tune.cluster_info import is_ray_cluster + + +def my_naive_trainable(config): + for i in range(int(config["num_iters"])): + tune.report(score=i + config["score"]) + time.sleep(config["sleep_time"]) + + +def main(): + ray.init(address="auto") + + num_samples = 1000 + + sleep_time = 0.1 + num_iters = 300 + + expected_run_time = num_iters * sleep_time + + # Allow minimum of 20 % overhead (or 10 seconds for short runs) + expected_run_time += max(expected_run_time * 0.2, 10.) + + if is_ray_cluster(): + # Add constant overhead for SSH connection + expected_run_time += 0.3 * num_samples + + start_time = time.time() + tune.run( + my_naive_trainable, + config={ + "score": tune.uniform(0., 1.), + "num_iters": num_iters, + "sleep_time": sleep_time + }, + reuse_actors=True, + verbose=2, + num_samples=num_samples) + time_taken = time.time() - start_time + + assert time_taken < expected_run_time, \ + f"The buffering test took {time_taken:.2f} seconds, but should not " \ + f"have exceeded {expected_run_time:.2f} seconds. Test failed." 
+ + print(f"The buffering test took {time_taken:.2f} seconds, which " + f"is below the budget of {expected_run_time:.2f} seconds. " + f"Test successful.") + + +if __name__ == "__main__": + main() diff --git a/release/tune_tests/scalability_tests/workloads/test_result_throughput_cluster.py b/release/tune_tests/scalability_tests/workloads/test_result_throughput_cluster.py deleted file mode 100644 index 8a3ba682ca89..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_result_throughput_cluster.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Result throughput on a cluster - -In this run, we will start 1000 trials concurrently that report often -(10 results per second). We thus measure the amount of overhead incurred when -dealing with a large number of results from distributed trials. - -Cluster: cluster_16x64.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 120 seconds. - -Theoretical minimum time: 100 seconds -""" -import os - -import ray -from ray import tune -from ray.tune.cluster_info import is_ray_cluster - -from _trainable import timed_tune_run - - -def main(): - os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" # Tweak - - ray.init(address="auto") - - num_samples = 1000 - results_per_second = 10 - trial_length_s = 100 - - max_runtime = 120 - - if is_ray_cluster(): - # Add constant overhead for SSH connection - max_runtime = 120 - - timed_tune_run( - name="result throughput cluster", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - sync_config=tune.SyncConfig(sync_to_driver=False)) # Tweak! 
- - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_result_throughput_single_node.py b/release/tune_tests/scalability_tests/workloads/test_result_throughput_single_node.py deleted file mode 100644 index 288b28d5f9a5..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_result_throughput_single_node.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Result throughput on a single node - -In this run, we will start 96 trials concurrently that report very often -(500 results per second). We thus measure the amount of overhead incurred when -dealing with a large number of results. - -Cluster: cluster_1x96.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 120 seconds. - -Theoretical minimum time: 100 seconds -""" -import os - -import ray - -from _trainable import timed_tune_run - - -def main(): - os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" # Tweak - - ray.init(address="auto") - - num_samples = 96 - results_per_second = 500 - trial_length_s = 100 - - max_runtime = 120 - - timed_tune_run( - name="result throughput single node", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_xgboost_sweep.py b/release/tune_tests/scalability_tests/workloads/test_xgboost_sweep.py deleted file mode 100644 index 16a1f261693a..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_xgboost_sweep.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Large-scale XGBoost parameter sweep - -In this run, we will start 32 trials of 32 actors each running distributed -XGBoost training. This test is more about making sure that the run succeeds -than about total runtime. However, it is expected that this is faster than -1 hour. - -We fix the max_depth to 4 and the number of boosting rounds to 100. 
The -fastest observed training time for 32 actors (1 CPU each) was about 2000 -seconds. We allow up to 10 minutes of slack, so aim for 2600 seconds total -tuning time. - -Cluster: cluster_16x64_data.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 2600 seconds. Should run without -errors. -""" -import os -import time - -import ray -from ray import tune - -from xgboost_ray import train, RayParams, RayDMatrix - - -def xgboost_train(config, num_actors=128, num_boost_round=200): - train_set = RayDMatrix( - os.path.expanduser("~/data/train.parquet"), "labels") - test_set = RayDMatrix(os.path.expanduser("~/data/test.parquet"), "labels") - - evals_result = {} - - bst = train( - params=config, - dtrain=train_set, - evals=[(test_set, "eval")], - evals_result=evals_result, - ray_params=RayParams( - max_actor_restarts=1, - gpus_per_actor=0, - cpus_per_actor=1, - num_actors=num_actors), - verbose_eval=False, - num_boost_round=num_boost_round) - - model_path = "tuned.xgb" - bst.save_model(model_path) - print("Final validation error: {:.4f}".format( - evals_result["eval"]["error"][-1])) - - -def main(): - name = "large xgboost sweep" - - ray.init(address="auto") - - num_samples = 32 - num_actors_per_sample = 32 - - max_runtime = 2600 - - config = { - "tree_method": "approx", - "objective": "binary:logistic", - "eval_metric": ["logloss", "error"], - "eta": tune.loguniform(1e-4, 1e-1), - "subsample": tune.uniform(0.5, 1.0), - "max_depth": 4 - } - - start_time = time.monotonic() - tune.run( - tune.with_parameters( - xgboost_train, - num_actors=num_actors_per_sample, - num_boost_round=100), - config=config, - num_samples=num_samples) - time_taken = time.monotonic() - start_time - - assert time_taken < max_runtime, \ - f"The {name} test took {time_taken:.2f} seconds, but should not " \ - f"have exceeded {max_runtime:.2f} seconds. Test failed. 
\n\n" \ - f"--- FAILED: {name.upper()} ::: " \ - f"{time_taken:.2f} > {max_runtime:.2f} ---" - - print(f"The {name} test took {time_taken:.2f} seconds, which " - f"is below the budget of {max_runtime:.2f} seconds. " - f"Test successful. \n\n" - f"--- PASSED: {name.upper()} ::: " - f"{time_taken:.2f} <= {max_runtime:.2f} ---") - - -if __name__ == "__main__": - main() From 575fe6ed14c05ea9b987244e9125e96a6f7e25ae Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 041/244] Revert "[RLlib] TFPolicy.export_model: Add timestep placeholder to model's signature, if needed. (#13988)" This reverts commit b922e47ffe42154c3a55d224c1a62e2f6d42c09b. --- rllib/policy/tf_policy.py | 5 ----- rllib/tests/test_export.py | 8 -------- 2 files changed, 13 deletions(-) diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index e71cd2b44971..f16f3f72adfd 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -709,14 +709,9 @@ def _build_signature_def(self): input_signature["prev_reward"] = \ tf1.saved_model.utils.build_tensor_info( self._prev_reward_input) - input_signature["is_training"] = \ tf1.saved_model.utils.build_tensor_info(self._is_training) - if self._timestep is not None: - input_signature["timestep"] = \ - tf1.saved_model.utils.build_tensor_info(self._timestep) - for state_input in self._state_inputs: input_signature[state_input.name] = \ tf1.saved_model.utils.build_tensor_info(state_input) diff --git a/rllib/tests/test_export.py b/rllib/tests/test_export.py index bb8bde8e15e6..711cc85b5956 100644 --- a/rllib/tests/test_export.py +++ b/rllib/tests/test_export.py @@ -6,11 +6,8 @@ import ray from ray.rllib.agents.registry import get_trainer_class -from ray.rllib.utils.framework import try_import_tf from ray.tune.trial import ExportFormat -tf1, tf, tfv = try_import_tf() - CONFIGS = { "A3C": { "explore": False, @@ -108,11 +105,6 @@ def 
valid_tf_checkpoint(checkpoint_dir): or not valid_tf_checkpoint(os.path.join(export_dir, ExportFormat.CHECKPOINT)): failures.append(alg_name) - - # Test loading the exported model. - model = tf.saved_model.load(os.path.join(export_dir, ExportFormat.MODEL)) - assert model - shutil.rmtree(export_dir) From 2733f37284456c506805001c108ce718f7f26965 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 042/244] Revert "[RLlib] DDPG: Support simplex action space. (#14011)" This reverts commit c00e000c4d8c561b239a4e7fbb9b5849fd78bf69. --- rllib/agents/ddpg/ddpg_tf_policy.py | 16 +++------ rllib/agents/ddpg/ddpg_torch_policy.py | 10 ++---- rllib/agents/ddpg/tests/test_ddpg.py | 11 ++++-- rllib/agents/sac/sac_torch_policy.py | 46 +++++++++++++------------- rllib/agents/sac/tests/test_sac.py | 11 ++++-- 5 files changed, 48 insertions(+), 46 deletions(-) diff --git a/rllib/agents/ddpg/ddpg_tf_policy.py b/rllib/agents/ddpg/ddpg_tf_policy.py index 203add618ce6..414910cc33f8 100644 --- a/rllib/agents/ddpg/ddpg_tf_policy.py +++ b/rllib/agents/ddpg/ddpg_tf_policy.py @@ -13,15 +13,13 @@ PRIO_WEIGHTS from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.models import ModelCatalog -from ray.rllib.models.tf.tf_action_dist import Deterministic, Dirichlet -from ray.rllib.models.torch.torch_action_dist import TorchDeterministic, \ - TorchDirichlet +from ray.rllib.models.tf.tf_action_dist import Deterministic +from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.utils.annotations import override from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import get_variable, try_import_tf -from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.tf_ops import huber_loss, make_tf_callable tf1, tf, tfv = 
try_import_tf() @@ -93,13 +91,9 @@ def get_distribution_inputs_and_class(policy, }, [], None) dist_inputs = model.get_policy_output(model_out) - if isinstance(policy.action_space, Simplex): - distr_class = TorchDirichlet if policy.config["framework"] == "torch" \ - else Dirichlet - else: - distr_class = TorchDeterministic if \ - policy.config["framework"] == "torch" else Deterministic - return dist_inputs, distr_class, [] # []=state out + return dist_inputs, (TorchDeterministic + if policy.config["framework"] == "torch" else + Deterministic), [] # []=state out def ddpg_actor_critic_loss(policy, model, _, train_batch): diff --git a/rllib/agents/ddpg/ddpg_torch_policy.py b/rllib/agents/ddpg/ddpg_torch_policy.py index 5041ae5fed46..f6c73f912da7 100644 --- a/rllib/agents/ddpg/ddpg_torch_policy.py +++ b/rllib/agents/ddpg/ddpg_torch_policy.py @@ -5,12 +5,10 @@ get_distribution_inputs_and_class, validate_spaces from ray.rllib.agents.dqn.dqn_tf_policy import postprocess_nstep_and_prio, \ PRIO_WEIGHTS -from ray.rllib.models.torch.torch_action_dist import TorchDeterministic, \ - TorchDirichlet +from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss torch, nn = try_import_torch() @@ -26,11 +24,7 @@ def build_ddpg_models_and_action_dist(policy, obs_space, action_space, config): device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) policy.target_model = policy.target_model.to(device) - - if isinstance(action_space, Simplex): - return model, TorchDirichlet - else: - return model, TorchDeterministic + return model, TorchDeterministic def ddpg_actor_critic_loss(policy, model, _, train_batch): diff --git 
a/rllib/agents/ddpg/tests/test_ddpg.py b/rllib/agents/ddpg/tests/test_ddpg.py index 0d5ddb8c5b0e..339f36fb537c 100644 --- a/rllib/agents/ddpg/tests/test_ddpg.py +++ b/rllib/agents/ddpg/tests/test_ddpg.py @@ -184,8 +184,15 @@ def test_ddpg_loss_function(self): env = SimpleEnv batch_size = 100 - obs_size = (batch_size, 1) - actions = np.random.random(size=(batch_size, 1)) + if env is SimpleEnv: + obs_size = (batch_size, 1) + actions = np.random.random(size=(batch_size, 1)) + elif env == "CartPole-v0": + obs_size = (batch_size, 4) + actions = np.random.randint(0, 2, size=(batch_size, )) + else: + obs_size = (batch_size, 3) + actions = np.random.random(size=(batch_size, 1)) # Batch of size=n. input_ = self._get_batch_helper(obs_size, actions, batch_size) diff --git a/rllib/agents/sac/sac_torch_policy.py b/rllib/agents/sac/sac_torch_policy.py index 60a206e91453..d000e183913c 100644 --- a/rllib/agents/sac/sac_torch_policy.py +++ b/rllib/agents/sac/sac_torch_policy.py @@ -32,29 +32,6 @@ logger = logging.getLogger(__name__) -def _get_dist_class(config: TrainerConfigDict, action_space: gym.spaces.Space - ) -> Type[TorchDistributionWrapper]: - """Helper function to return a dist class based on config and action space. - - Args: - config (TrainerConfigDict): The Trainer's config dict. - action_space (gym.spaces.Space): The action space used. - - Returns: - Type[TFActionDistribution]: A TF distribution class. 
- """ - if isinstance(action_space, Discrete): - return TorchCategorical - elif isinstance(action_space, Simplex): - return TorchDirichlet - else: - if config["normalize_actions"]: - return TorchSquashedGaussian if \ - not config["_use_beta_distribution"] else TorchBeta - else: - return TorchDiagGaussian - - def build_sac_model_and_action_dist( policy: Policy, obs_space: gym.spaces.Space, @@ -79,6 +56,29 @@ def build_sac_model_and_action_dist( return model, action_dist_class +def _get_dist_class(config: TrainerConfigDict, action_space: gym.spaces.Space + ) -> Type[TorchDistributionWrapper]: + """Helper function to return a dist class based on config and action space. + + Args: + config (TrainerConfigDict): The Trainer's config dict. + action_space (gym.spaces.Space): The action space used. + + Returns: + Type[TFActionDistribution]: A TF distribution class. + """ + if isinstance(action_space, Discrete): + return TorchCategorical + elif isinstance(action_space, Simplex): + return TorchDirichlet + else: + if config["normalize_actions"]: + return TorchSquashedGaussian if \ + not config["_use_beta_distribution"] else TorchBeta + else: + return TorchDiagGaussian + + def action_distribution_fn( policy: Policy, model: ModelV2, diff --git a/rllib/agents/sac/tests/test_sac.py b/rllib/agents/sac/tests/test_sac.py index b32beaac13fd..1ec87370982d 100644 --- a/rllib/agents/sac/tests/test_sac.py +++ b/rllib/agents/sac/tests/test_sac.py @@ -186,8 +186,15 @@ def test_sac_loss_function(self): env = SimpleEnv batch_size = 100 - obs_size = (batch_size, 1) - actions = np.random.random(size=(batch_size, 2)) + if env is SimpleEnv: + obs_size = (batch_size, 1) + actions = np.random.random(size=(batch_size, 2)) + elif env == "CartPole-v0": + obs_size = (batch_size, 4) + actions = np.random.randint(0, 2, size=(batch_size, )) + else: + obs_size = (batch_size, 3) + actions = np.random.random(size=(batch_size, 1)) # Batch of size=n. 
input_ = self._get_batch_helper(obs_size, actions, batch_size) From 5e193d65c6bbf6c6e9f35cdbf3d41b841d149fb1 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 043/244] Revert "[Core]Fix ray.kill doesn't cancel pending actor bug (#14025)" This reverts commit cbfa5436f758cf5ab2a57477f3ded789e2100d7b. --- .../main/java/io/ray/test/KillActorTest.java | 2 - python/ray/tests/test_actor_advanced.py | 84 ------------ python/ray/tests/test_failure.py | 7 +- python/ray/tests/test_placement_group.py | 12 +- python/ray/tests/test_queue.py | 6 +- python/ray/tests/test_reference_counting.py | 4 +- src/ray/core_worker/core_worker.cc | 4 +- src/ray/core_worker/core_worker.h | 1 - src/ray/gcs/accessor.h | 10 -- .../gcs/gcs_client/service_based_accessor.cc | 20 --- .../gcs/gcs_client/service_based_accessor.h | 3 - src/ray/gcs/gcs_server/gcs_actor_manager.cc | 126 +++++------------- src/ray/gcs/gcs_server/gcs_actor_manager.h | 26 +--- src/ray/gcs/gcs_server/gcs_actor_scheduler.cc | 36 +---- src/ray/gcs/gcs_server/gcs_actor_scheduler.h | 6 +- .../gcs_server/test/gcs_actor_manager_test.cc | 9 +- .../test/gcs_actor_scheduler_test.cc | 3 +- src/ray/protobuf/gcs_service.proto | 18 --- src/ray/rpc/gcs_server/gcs_rpc_client.h | 4 - src/ray/rpc/gcs_server/gcs_rpc_server.h | 5 - 20 files changed, 56 insertions(+), 330 deletions(-) diff --git a/java/test/src/main/java/io/ray/test/KillActorTest.java b/java/test/src/main/java/io/ray/test/KillActorTest.java index 753b00a9c59c..fd92b97118ef 100644 --- a/java/test/src/main/java/io/ray/test/KillActorTest.java +++ b/java/test/src/main/java/io/ray/test/KillActorTest.java @@ -59,8 +59,6 @@ private static void remoteKill(ActorHandle actor, boolean noRestart) { private void testKillActor(BiConsumer, Boolean> kill, boolean noRestart) { ActorHandle actor = Ray.actor(HangActor::new).setMaxRestarts(1).remote(); - // Wait for the actor to be created. 
- actor.task(HangActor::ping).remote().get(); ObjectRef result = actor.task(HangActor::hang).remote(); // The actor will hang in this task. Assert.assertEquals(0, Ray.wait(ImmutableList.of(result), 1, 500).getReady().size()); diff --git a/python/ray/tests/test_actor_advanced.py b/python/ray/tests/test_actor_advanced.py index 496e977fe9cd..1913decf83df 100644 --- a/python/ray/tests/test_actor_advanced.py +++ b/python/ray/tests/test_actor_advanced.py @@ -1093,90 +1093,6 @@ class Actor2: global_state_accessor.disconnect() -def test_kill_pending_actor_with_no_restart_true(): - cluster = ray.init() - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - @ray.remote(resources={"WORKER": 1.0}) - class PendingActor: - pass - - # Kill actor with `no_restart=True`. - actor = PendingActor.remote() - # TODO(ffbin): The raylet doesn't guarantee the order when dealing with - # RequestWorkerLease and CancelWorkerLease. If we kill the actor - # immediately after creating the actor, we may not be able to clean up - # the request cached by the raylet. - # See https://github.com/ray-project/ray/issues/13545 for details. - time.sleep(1) - ray.kill(actor, no_restart=True) - - def condition1(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return True - return False - - # Actor is dead, so the infeasible task queue length is 0. 
- wait_for_condition(condition1, timeout=10) - - global_state_accessor.disconnect() - ray.shutdown() - - -def test_kill_pending_actor_with_no_restart_false(): - cluster = ray.init() - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - @ray.remote(resources={"WORKER": 1.0}, max_restarts=1) - class PendingActor: - pass - - # Kill actor with `no_restart=False`. - actor = PendingActor.remote() - # TODO(ffbin): The raylet doesn't guarantee the order when dealing with - # RequestWorkerLease and CancelWorkerLease. If we kill the actor - # immediately after creating the actor, we may not be able to clean up - # the request cached by the raylet. - # See https://github.com/ray-project/ray/issues/13545 for details. - time.sleep(1) - ray.kill(actor, no_restart=False) - - def condition1(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return False - return True - - # Actor restarts, so the infeasible task queue length is 1. - wait_for_condition(condition1, timeout=10) - - # Kill actor again and actor is dead, - # so the infeasible task queue length is 0. - ray.kill(actor, no_restart=False) - - def condition2(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return True - return False - - wait_for_condition(condition2, timeout=10) - - global_state_accessor.disconnect() - ray.shutdown() - - if __name__ == "__main__": import pytest # Test suite is timing out. Disable on windows for now. 
diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index fca209743129..f6aad1fa3185 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -754,15 +754,12 @@ class Foo: def __init__(self): time.sleep(1000) - # NOTE: We should save actor, otherwise it will be out of scope. - actors = [Foo.remote() for _ in range(num_cpus * 3)] - assert len(actors) == num_cpus * 3 + [Foo.remote() for _ in range(num_cpus * 3)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR - actors = [Foo.remote() for _ in range(num_cpus)] - assert len(actors) == num_cpus + [Foo.remote() for _ in range(num_cpus)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 92ef90ca4e1e..024ff6c5557a 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -902,10 +902,8 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # Now create an actor, but do not capture the current tasks a = Actor.options( @@ -927,10 +925,8 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # Lastly, make sure when None is specified, actors are not scheduled # on the same placement group. @@ -1420,10 +1416,8 @@ def schedule_nested_actor_with_detached_pg(self): # Kill an actor and wait until it is killed. 
ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # We should have 2 alive pgs and 4 alive actors. assert assert_alive_num_pg(2) diff --git a/python/ray/tests/test_queue.py b/python/ray/tests/test_queue.py index 88cf6d7b647f..6c2fb5cf0ec9 100644 --- a/python/ray/tests/test_queue.py +++ b/python/ray/tests/test_queue.py @@ -199,19 +199,17 @@ def test_custom_resources(ray_start_regular_shared): assert current_resources["CPU"] == 1.0 # By default an actor should not reserve any resources. - q = Queue() + Queue() current_resources = ray.available_resources() assert current_resources["CPU"] == 1.0 - q.shutdown() # Specify resource requirement. The queue should now reserve 1 CPU. - q = Queue(actor_options={"num_cpus": 1}) + Queue(actor_options={"num_cpus": 1}) def no_cpu_in_resources(): return "CPU" not in ray.available_resources() wait_for_condition(no_cpu_in_resources) - q.shutdown() if __name__ == "__main__": diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 9fcd3c25f4c4..02638ed3dea8 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -470,10 +470,8 @@ def delete_ref2(self): # Test that the actor exiting stops the reference from being pinned. ray.kill(actor) # Wait for the actor to exit. - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(actor.delete_ref1.remote()) - except ray.exceptions.RayActorError: - pass else: # Test that deleting the second reference stops it from being pinned. 
ray.get(actor.delete_ref2.remote()) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 73b8b89815f2..cf5a1f532cb9 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1637,9 +1637,7 @@ Status CoreWorker::KillActor(const ActorID &actor_id, bool force_kill, bool no_r stream << "Failed to find a corresponding actor handle for " << actor_id; return Status::Invalid(stream.str()); } - - RAY_CHECK_OK( - gcs_client_->Actors().AsyncKillActor(actor_id, force_kill, no_restart, nullptr)); + direct_actor_submitter_->KillActor(actor_id, force_kill, no_restart); return Status::OK(); } diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index e1632644195d..72ef4f36ca7b 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -730,7 +730,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Tell an actor to exit immediately, without completing outstanding work. /// /// \param[in] actor_id ID of the actor to kill. - /// \param[in] force_kill Whether to force kill an actor by killing the worker. /// \param[in] no_restart If set to true, the killed actor will not be /// restarted anymore. /// \param[out] Status diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index db240b411cdf..be929ec3ff0d 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -64,16 +64,6 @@ class ActorInfoAccessor { virtual Status AsyncRegisterActor(const TaskSpecification &task_spec, const StatusCallback &callback) = 0; - /// Kill actor via GCS asynchronously. - /// - /// \param actor_id The ID of actor to destroy. - /// \param force_kill Whether to force kill an actor by killing the worker. - /// \param no_restart If set to true, the killed actor will not be restarted anymore. - /// \param callback Callback that will be called after the actor is destroyed. 
- /// \return Status - virtual Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, - const StatusCallback &callback) = 0; - /// Asynchronously request GCS to create the actor. /// /// This should be called after the worker has resolved the actor dependencies. diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index 5905966cb92a..a82e0ab6bcdd 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -200,26 +200,6 @@ Status ServiceBasedActorInfoAccessor::AsyncRegisterActor( return Status::OK(); } -Status ServiceBasedActorInfoAccessor::AsyncKillActor( - const ActorID &actor_id, bool force_kill, bool no_restart, - const ray::gcs::StatusCallback &callback) { - rpc::KillActorViaGcsRequest request; - request.set_actor_id(actor_id.Binary()); - request.set_force_kill(force_kill); - request.set_no_restart(no_restart); - client_impl_->GetGcsRpcClient().KillActorViaGcs( - request, [callback](const Status &, const rpc::KillActorViaGcsReply &reply) { - if (callback) { - auto status = - reply.status().code() == (int)StatusCode::OK - ? 
Status() - : Status(StatusCode(reply.status().code()), reply.status().message()); - callback(status); - } - }); - return Status::OK(); -} - Status ServiceBasedActorInfoAccessor::AsyncCreateActor( const ray::TaskSpecification &task_spec, const ray::gcs::StatusCallback &callback) { RAY_CHECK(task_spec.IsActorCreationTask() && callback); diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index 8aab5198f28e..c883e7b626a7 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -85,9 +85,6 @@ class ServiceBasedActorInfoAccessor : public ActorInfoAccessor { Status AsyncCreateActor(const TaskSpecification &task_spec, const StatusCallback &callback) override; - Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, - const StatusCallback &callback) override; - Status AsyncSubscribeAll( const SubscribeCallback &subscribe, const StatusCallback &done) override; diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 338fc149c327..2f3740654c8b 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -214,25 +214,6 @@ void GcsActorManager::HandleGetNamedActorInfo( ++counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST]; } -void GcsActorManager::HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, - rpc::KillActorViaGcsReply *reply, - rpc::SendReplyCallback send_reply_callback) { - const auto &actor_id = ActorID::FromBinary(request.actor_id()); - bool force_kill = request.force_kill(); - bool no_restart = request.no_restart(); - if (no_restart) { - DestroyActor(actor_id); - } else { - KillActor(actor_id, force_kill, no_restart); - } - - GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); - RAY_LOG(DEBUG) << "Finished killing actor, job id = " << actor_id.JobId() - << ", actor id = " << actor_id << ", force_kill = 
" << force_kill - << ", no_restart = " << no_restart; - ++counts_[CountType::KILL_ACTOR_REQUEST]; -} - Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &request, RegisterActorCallback success_callback) { // NOTE: After the abnormal recovery of the network between GCS client and GCS server or @@ -436,11 +417,8 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { actor_to_register_callbacks_.erase(actor_id); actor_to_create_callbacks_.erase(actor_id); auto it = registered_actors_.find(actor_id); - if (it == registered_actors_.end()) { - RAY_LOG(INFO) << "Tried to destroy actor that does not exist " << actor_id; - return; - } - const auto &task_id = it->second->GetCreationTaskSpecification().TaskId(); + RAY_CHECK(it != registered_actors_.end()) + << "Tried to destroy actor that does not exist " << actor_id; it->second->GetMutableActorTableData()->mutable_task_spec()->Clear(); it->second->GetMutableActorTableData()->set_timestamp(current_sys_time_ms()); AddDestroyedActorToCache(it->second); @@ -478,13 +456,38 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { // The actor has already been created. Destroy the process by force-killing // it. - NotifyCoreWorkerToKillActor(actor); + KillActor(actor); RAY_CHECK(node_it->second.erase(actor->GetWorkerID())); if (node_it->second.empty()) { created_actors_.erase(node_it); } } else { - CancelActorInScheduling(actor, task_id); + // The actor has not been created yet. It is either being scheduled or is + // pending scheduling. + auto canceled_actor_id = + gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); + if (!canceled_actor_id.IsNil()) { + // The actor was being scheduled and has now been canceled. 
+ RAY_CHECK(canceled_actor_id == actor_id); + } else { + auto pending_it = + std::find_if(pending_actors_.begin(), pending_actors_.end(), + [actor_id](const std::shared_ptr &actor) { + return actor->GetActorID() == actor_id; + }); + + // The actor was pending scheduling. Remove it from the queue. + if (pending_it != pending_actors_.end()) { + pending_actors_.erase(pending_it); + } else { + // When actor creation request of this actor id is pending in raylet, + // it doesn't responds, and the actor should be still in leasing state. + // NOTE: Raylet will cancel the lease request once it receives the + // actor state notification. So this method doesn't have to cancel + // outstanding lease request by calling raylet_client->CancelWorkerLease + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id); + } + } } } @@ -703,7 +706,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, [this, actor, actor_id, mutable_actor_table_data](Status status) { - // If actor was an detached actor, make sure to destroy it. + // if actor was an detached actor, make sure to destroy it. // We need to do this because detached actors are not destroyed // when its owners are dead because it doesn't have owners. 
if (actor->IsDetached()) { @@ -931,47 +934,15 @@ void GcsActorManager::RemoveActorFromOwner(const std::shared_ptr &acto } } -void GcsActorManager::NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, - bool force_kill, bool no_restart) { +void GcsActorManager::KillActor(const std::shared_ptr &actor) { auto actor_client = worker_client_factory_(actor->GetAddress()); rpc::KillActorRequest request; request.set_intended_actor_id(actor->GetActorID().Binary()); - request.set_force_kill(force_kill); - request.set_no_restart(no_restart); + request.set_force_kill(true); + request.set_no_restart(true); RAY_UNUSED(actor_client->KillActor(request, nullptr)); } -void GcsActorManager::KillActor(const ActorID &actor_id, bool force_kill, - bool no_restart) { - RAY_LOG(DEBUG) << "Killing actor, job id = " << actor_id.JobId() - << ", actor id = " << actor_id << ", force_kill = " << force_kill; - const auto &it = registered_actors_.find(actor_id); - if (it == registered_actors_.end()) { - RAY_LOG(INFO) << "Tried to kill actor that does not exist " << actor_id; - return; - } - - const auto &actor = it->second; - if (actor->GetState() == rpc::ActorTableData::DEAD || - actor->GetState() == rpc::ActorTableData::DEPENDENCIES_UNREADY) { - return; - } - - // The actor is still alive or pending creation. - const auto &node_id = actor->GetNodeID(); - const auto &worker_id = actor->GetWorkerID(); - auto node_it = created_actors_.find(node_id); - if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { - // The actor has already been created. Destroy the process by force-killing - // it. 
- NotifyCoreWorkerToKillActor(actor, force_kill, no_restart); - } else { - const auto &task_id = actor->GetCreationTaskSpecification().TaskId(); - CancelActorInScheduling(actor, task_id); - ReconstructActor(actor_id, /*need_reschedule=*/true); - } -} - void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr &actor) { if (destroyed_actors_.size() >= RayConfig::instance().maximum_gcs_destroyed_actor_cached_count()) { @@ -985,36 +956,6 @@ void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr & actor->GetActorID(), (int64_t)actor->GetActorTableData().timestamp()); } -void GcsActorManager::CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id) { - const auto &actor_id = actor->GetActorID(); - const auto &node_id = actor->GetNodeID(); - // The actor has not been created yet. It is either being scheduled or is - // pending scheduling. - auto canceled_actor_id = - gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); - if (!canceled_actor_id.IsNil()) { - // The actor was being scheduled and has now been canceled. - RAY_CHECK(canceled_actor_id == actor_id); - } else { - auto pending_it = std::find_if(pending_actors_.begin(), pending_actors_.end(), - [actor_id](const std::shared_ptr &actor) { - return actor->GetActorID() == actor_id; - }); - - // The actor was pending scheduling. Remove it from the queue. - if (pending_it != pending_actors_.end()) { - pending_actors_.erase(pending_it); - } else { - // When actor creation request of this actor id is pending in raylet, - // it doesn't responds, and the actor should be still in leasing state. - // NOTE: We will cancel outstanding lease request by calling - // `raylet_client->CancelWorkerLease`. 
- gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id, task_id); - } - } -} - std::string GcsActorManager::DebugString() const { std::ostringstream stream; stream << "GcsActorManager: {RegisterActor request count: " @@ -1023,7 +964,6 @@ std::string GcsActorManager::DebugString() const { << ", GetActorInfo request count: " << counts_[CountType::GET_ACTOR_INFO_REQUEST] << ", GetNamedActorInfo request count: " << counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST] - << ", KillActor request count: " << counts_[CountType::KILL_ACTOR_REQUEST] << ", Registered actors count: " << registered_actors_.size() << ", Destroyed actors count: " << destroyed_actors_.size() << ", Named actors count: " << named_actors_.size() diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index f2db9345f0ba..d3ffc309793e 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -190,10 +190,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { rpc::GetAllActorInfoReply *reply, rpc::SendReplyCallback send_reply_callback) override; - void HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, - rpc::KillActorViaGcsReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - /// Register actor asynchronously. /// /// \param request Contains the meta info to create the actor. @@ -340,18 +336,8 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// Kill the specified actor. /// - /// \param actor_id ID of the actor to kill. - /// \param force_kill Whether to force kill an actor by killing the worker. - /// \param no_restart If set to true, the killed actor will not be restarted anymore. - void KillActor(const ActorID &actor_id, bool force_kill, bool no_restart); - - /// Notify CoreWorker to kill the specified actor. - /// /// \param actor The actor to be killed. - /// \param force_kill Whether to force kill an actor by killing the worker. 
- /// \param no_restart If set to true, the killed actor will not be restarted anymore. - void NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, - bool force_kill = true, bool no_restart = true); + void KillActor(const std::shared_ptr &actor); /// Add the destroyed actor to the cache. If the cache is full, one actor is randomly /// evicted. @@ -370,13 +356,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { return actor_delta; } - /// Cancel actor which is either being scheduled or is pending scheduling. - /// - /// \param actor The actor to be cancelled. - /// \param task_id The id of actor creation task to be cancelled. - void CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id); - /// Callbacks of pending `RegisterActor` requests. /// Maps actor ID to actor registration callbacks, which is used to filter duplicated /// messages from a driver/worker caused by some network problems. @@ -434,8 +413,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { GET_ACTOR_INFO_REQUEST = 2, GET_NAMED_ACTOR_INFO_REQUEST = 3, GET_ALL_ACTOR_INFO_REQUEST = 4, - KILL_ACTOR_REQUEST = 5, - CountType_MAX = 6, + CountType_MAX = 10, }; uint64_t counts_[CountType::CountType_MAX] = {0}; }; diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc index 1b4201c4f573..9c81c8c0e98d 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc @@ -127,27 +127,13 @@ std::vector GcsActorScheduler::CancelOnNode(const NodeID &node_id) { return actor_ids; } -void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) { - // NOTE: This method will cancel the outstanding lease request and remove leasing - // information from the internal state. 
+void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) { + // NOTE: This method does not currently cancel the outstanding lease request. + // It only removes leasing information from the internal state so that + // RequestWorkerLease ignores the response from raylet. auto node_it = node_to_actors_when_leasing_.find(node_id); - if (node_it != node_to_actors_when_leasing_.end()) { - node_it->second.erase(actor_id); - } - - const auto &alive_nodes = gcs_node_manager_.GetAllAliveNodes(); - const auto &iter = alive_nodes.find(node_id); - if (iter != alive_nodes.end()) { - const auto &node_info = iter->second; - rpc::Address address; - address.set_raylet_id(node_info->node_id()); - address.set_ip_address(node_info->node_manager_address()); - address.set_port(node_info->node_manager_port()); - auto lease_client = GetOrConnectLeaseClient(address); - lease_client->CancelWorkerLease( - task_id, [](const Status &status, const rpc::CancelWorkerLeaseReply &reply) {}); - } + RAY_CHECK(node_it != node_to_actors_when_leasing_.end()); + node_it->second.erase(actor_id); } ActorID GcsActorScheduler::CancelOnWorker(const NodeID &node_id, @@ -252,16 +238,6 @@ void GcsActorScheduler::LeaseWorkerFromNode(std::shared_ptr actor, } if (status.ok()) { - if (reply.worker_address().raylet_id().empty() && - reply.retry_at_raylet_address().raylet_id().empty()) { - // Actor creation task has been cancelled. It is triggered by `ray.kill`. If - // the number of remaining restarts of the actor is not equal to 0, GCS will - // reschedule the actor, so it return directly here. - RAY_LOG(DEBUG) << "Actor " << actor->GetActorID() - << " creation task has been cancelled."; - return; - } - // Remove the actor from the leasing map as the reply is returned from the // remote node. 
iter->second.erase(actor_iter); diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h index c0e3d430ecbf..71dd351087e0 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h @@ -59,8 +59,7 @@ class GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. - virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) = 0; + virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) = 0; /// Cancel the actor that is being scheduled to the specified worker. /// @@ -131,8 +130,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. - void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) override; + void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) override; /// Cancel the actor that is being scheduled to the specified worker. 
/// diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc index b8edb6e82164..b88c6702bfeb 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc @@ -35,8 +35,7 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { MOCK_METHOD1(CancelOnNode, std::vector(const NodeID &node_id)); MOCK_METHOD2(CancelOnWorker, ActorID(const NodeID &node_id, const WorkerID &worker_id)); - MOCK_METHOD3(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id)); + MOCK_METHOD2(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id)); std::vector> actors; }; @@ -736,10 +735,8 @@ TEST_F(GcsActorManagerTest, TestRaceConditionCancelLease) { address.set_raylet_id(node_id.Binary()); address.set_worker_id(worker_id.Binary()); actor->UpdateAddress(address); - const auto &actor_id = actor->GetActorID(); - const auto &task_id = - TaskID::FromBinary(registered_actor->GetActorTableData().task_spec().task_id()); - EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id, task_id)); + const auto actor_id = actor->GetActorID(); + EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id)); gcs_actor_manager_->OnWorkerDead(owner_node_id, owner_worker_id); } diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index bd98d65ef0f9..d84f99b3fe88 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -262,8 +262,7 @@ TEST_F(GcsActorSchedulerTest, TestLeasingCancelledWhenLeasing) { ASSERT_EQ(1, raylet_client_->callbacks.size()); // Cancel the lease request. 
- const auto &task_id = TaskID::FromBinary(create_actor_request.task_spec().task_id()); - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID(), task_id); + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID()); ASSERT_EQ(1, raylet_client_->num_workers_requested); ASSERT_EQ(1, raylet_client_->callbacks.size()); diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 41c71c7e05ca..78462cb2a5c3 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -87,22 +87,6 @@ message GetAllActorInfoReply { repeated ActorTableData actor_table_data = 2; } -// `KillActorViaGcsRequest` is sent to GCS Service to ask to kill an actor. -// `KillActorViaGcsRequest` is different from `KillActorRequest`. -// `KillActorRequest` is send to core worker to ask to kill an actor. -message KillActorViaGcsRequest { - // ID of this actor. - bytes actor_id = 1; - // Whether to force kill the actor. - bool force_kill = 2; - // If set to true, the killed actor will not be restarted anymore. - bool no_restart = 3; -} - -message KillActorViaGcsReply { - GcsStatus status = 1; -} - // Service for actor info access. service ActorInfoGcsService { // Register actor to gcs service. @@ -115,8 +99,6 @@ service ActorInfoGcsService { rpc GetNamedActorInfo(GetNamedActorInfoRequest) returns (GetNamedActorInfoReply); // Get information of all actor from GCS Service. rpc GetAllActorInfo(GetAllActorInfoRequest) returns (GetAllActorInfoReply); - // Kill actor via GCS Service. 
- rpc KillActorViaGcs(KillActorViaGcsRequest) returns (KillActorViaGcsReply); } message RegisterNodeRequest { diff --git a/src/ray/rpc/gcs_server/gcs_rpc_client.h b/src/ray/rpc/gcs_server/gcs_rpc_client.h index bae0e56bd9ae..bf9a72bed7db 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_client.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_client.h @@ -144,10 +144,6 @@ class GcsRpcClient { VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, GetAllActorInfo, actor_info_grpc_client_, ) - /// Kill actor via GCS Service. - VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, KillActorViaGcs, - actor_info_grpc_client_, ) - /// Register a node to GCS Service. VOID_GCS_RPC_CLIENT_METHOD(NodeInfoGcsService, RegisterNode, node_info_grpc_client_, ) diff --git a/src/ray/rpc/gcs_server/gcs_rpc_server.h b/src/ray/rpc/gcs_server/gcs_rpc_server.h index 246a5ee9e306..328aa5f7382d 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_server.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_server.h @@ -125,10 +125,6 @@ class ActorInfoGcsServiceHandler { virtual void HandleGetAllActorInfo(const GetAllActorInfoRequest &request, GetAllActorInfoReply *reply, SendReplyCallback send_reply_callback) = 0; - - virtual void HandleKillActorViaGcs(const KillActorViaGcsRequest &request, - KillActorViaGcsReply *reply, - SendReplyCallback send_reply_callback) = 0; }; /// The `GrpcService` for `ActorInfoGcsService`. @@ -152,7 +148,6 @@ class ActorInfoGrpcService : public GrpcService { ACTOR_INFO_SERVICE_RPC_HANDLER(GetActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetNamedActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetAllActorInfo); - ACTOR_INFO_SERVICE_RPC_HANDLER(KillActorViaGcs); } private: From 5f9f0745d0f43a423a45bd31a12c48fb0aa1bb1c Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 044/244] Revert "[Docs] RayDP Documentation (#14018)" This reverts commit 36904a43ea5b49f379480cd3fd6ccc066ac0cc40. 
--- doc/source/index.rst | 1 - doc/source/raydp.rst | 104 ------------------------------------------- 2 files changed, 105 deletions(-) delete mode 100644 doc/source/raydp.rst diff --git a/doc/source/index.rst b/doc/source/index.rst index 277c82e55a69..a37ff8d6b9a8 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -308,7 +308,6 @@ Papers modin/index.rst dask-on-ray.rst mars-on-ray.rst - raydp.rst ray-client.rst .. toctree:: diff --git a/doc/source/raydp.rst b/doc/source/raydp.rst deleted file mode 100644 index cee14234439c..000000000000 --- a/doc/source/raydp.rst +++ /dev/null @@ -1,104 +0,0 @@ -******************** -RayDP (Spark on Ray) -******************** - -RayDP combines your Spark and Ray clusters, making it easy to do large scale -data processing using the PySpark API and seemlessly use that data to train -your models using TensorFlow and PyTorch. - -For more information and examples, see the RayDP Github page: -https://github.com/oap_project/raydp - -================ -Installing RayDP -================ - -RayDP can be installed from PyPI and supports PySpark 3.0 and 3.1. - -.. code-block bash - - pip install raydp - -.. note:: - RayDP requires ray >= 1.2.0 - -.. note:: - In order to run Spark, the head and worker nodes will need Java installed. - -======================== -Creating a Spark Session -======================== - -To create a spark session, call ``raydp.init_spark`` - -For example, - -.. code-block:: python - - import raydp - - spark = raydp.init_spark( - app_name = "example", - num_executors = 10, - executor_cores = 64, - memory_per_executor = "256GB" - ) - -==================================== -Deep Learning with a Spark DataFrame -==================================== - -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Training a Spark DataFrame with TensorFlow -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``raydp.tf.TFEstimator`` provides an API for training with TensorFlow. - -.. 
code-block:: python - - d = [{'age': 17 , 'grade': 12}] - df = spark.createDataFrame(d).collect() - - - from tensorflow import keras - model = keras.Sequential([]) - - estimator = raydp.tf.TFEstimator( - model = model, - num_worker = 10, - feature_columns = ["age"], - label_column = ["grade"] - ) - - estimator.fit_on_spark(df, test_df=None) - - tensorflow_model = estimator.get_model() - - -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Training a Spark DataFrame with TensorFlow -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Similarly, ``raydp.torch.TorchEstimator`` provides an API for training with -PyTorch. - -.. code-block:: python - - d = [{'age': 17 , 'grade': 12}] - df = spark.createDataFrame(d).collect() - - - import torch - model = torch.nn.Sequential() - - estimator = raydp.tf.TFEstimator( - model = model, - num_worker = 10, - feature_columns = ["age"], - label_column = ["grade"] - ) - - estimator.fit_on_spark(df, test_df=None) - - pytorch_model = estimator.get_model() - From 92cecd8c611043748c6dcd52e6ba4cfa0e98e21f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 045/244] Revert "HotFix k8s autoscaling (#14024)" This reverts commit 74e87a060ff04b9a01f8d72e14ec073c5c9347ba. --- python/ray/autoscaler/_private/kubernetes/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/ray/autoscaler/_private/kubernetes/config.py b/python/ray/autoscaler/_private/kubernetes/config.py index dcc315bc9c92..b285e7701ff6 100644 --- a/python/ray/autoscaler/_private/kubernetes/config.py +++ b/python/ray/autoscaler/_private/kubernetes/config.py @@ -94,11 +94,6 @@ def get_autodetected_resources(container_data): for resource_name in ["cpu", "gpu"] } - # Throw out GPU from resource dict if the amount is 0. 
- for key in copy.deepcopy(node_type_resources): - if node_type_resources[key] == 0: - del node_type_resources[key] - return node_type_resources From ab26ee9f40509aab8134ba7c69263bcf567570a5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 046/244] Revert "Add tip on how to disable Ray OOM handler (#14017)" This reverts commit 77d3f055d43a4d5fec0b100f55b1aa87a8ae0a7a. --- python/ray/memory_monitor.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/ray/memory_monitor.py b/python/ray/memory_monitor.py index 448678d0283f..9381c506459e 100644 --- a/python/ray/memory_monitor.py +++ b/python/ray/memory_monitor.py @@ -54,9 +54,7 @@ def get_message(used_gb, total_gb, threshold): round(get_shared(psutil.virtual_memory()) / (1024**3), 2)) + "currently being used by the Ray object store.\n---\n" "--- Tip: Use the `ray memory` command to list active " - "objects in the cluster.\n" - "--- To disable OOM exceptions, set " - "RAY_DISABLE_MEMORY_MONITOR=1.\n---\n") + "objects in the cluster.\n---\n") class MemoryMonitor: @@ -122,9 +120,8 @@ def get_memory_usage(self): def raise_if_low_memory(self): if time.time() - self.last_checked > self.check_interval: - if ("RAY_DEBUG_DISABLE_MEMORY_MONITOR" in os.environ - or "RAY_DISABLE_MEMORY_MONITOR" in os.environ): - return + if "RAY_DEBUG_DISABLE_MEMORY_MONITOR" in os.environ: + return # escape hatch, not intended for user use self.last_checked = time.time() used_gb, total_gb = self.get_memory_usage() From 7948a9f56fe107ae6883ce2f86eec7993109c4de Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 047/244] Revert "[Autoscaler] Monitor refactor for backward compatability. (#13970)" This reverts commit 76fc65f7700d12f678230ac333f894b63e6c60ca. 
--- python/ray/monitor.py | 75 +++++++--- python/ray/tests/test_multi_node_2.py | 51 +------ src/ray/protobuf/common.proto | 34 ++--- src/ray/protobuf/gcs.proto | 203 ++++++++++++-------------- src/ray/protobuf/gcs_service.proto | 68 ++++----- 5 files changed, 195 insertions(+), 236 deletions(-) diff --git a/python/ray/monitor.py b/python/ray/monitor.py index 72de4e87099b..fe1edad6380d 100644 --- a/python/ray/monitor.py +++ b/python/ray/monitor.py @@ -8,8 +8,6 @@ import traceback import json -import grpc - import ray from ray.autoscaler._private.autoscaler import StandardAutoscaler from ray.autoscaler._private.commands import teardown_cluster @@ -19,10 +17,11 @@ from ray.autoscaler._private.constants import \ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE from ray.autoscaler._private.util import DEBUG_AUTOSCALING_STATUS - -from ray.core.generated import gcs_service_pb2, gcs_service_pb2_grpc +import ray.gcs_utils +import ray.utils import ray.ray_constants as ray_constants from ray.ray_logging import setup_component_logger +from ray._raylet import GlobalStateAccessor from ray.experimental.internal_kv import _internal_kv_put, \ _internal_kv_initialized, _internal_kv_get @@ -91,17 +90,16 @@ def __init__(self, redis_address, redis_password=redis_password) self.redis = ray._private.services.create_redis_client( redis_address, password=redis_password) - - # Initialize the gcs stub for getting all node resource usage. - gcs_address = self.redis.get("GcsServerAddress").decode("utf-8") - gcs_channel = grpc.insecure_channel(gcs_address) - self.gcs_node_resources_stub = \ - gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel) - + self.global_state_accessor = GlobalStateAccessor( + redis_address, redis_password, False) + self.global_state_accessor.connect() # Set the redis client and mode so _internal_kv works for autoscaler. 
worker = ray.worker.global_worker worker.redis_client = self.redis worker.mode = 0 + # Keep a mapping from raylet client ID to IP address to use + # for updating the load metrics. + self.raylet_id_to_ip_map = {} head_node_ip = redis_address.split(":")[0] self.load_metrics = LoadMetrics(local_ip=head_node_ip) self.last_avail_resources = None @@ -119,14 +117,19 @@ def __init__(self, logger.info("Monitor: Started") + def __del__(self): + """Destruct the monitor object.""" + # We close the pubsub client to avoid leaking file descriptors. + if self.global_state_accessor is not None: + self.global_state_accessor.disconnect() + self.global_state_accessor = None + def update_load_metrics(self): """Fetches resource usage data from GCS and updates load metrics.""" - request = gcs_service_pb2.GetAllResourceUsageRequest() - response = self.gcs_node_resources_stub.GetAllResourceUsage( - request, timeout=3) - resources_batch_data = response.resource_usage_data - + all_resources = self.global_state_accessor.get_all_resource_usage() + resources_batch_data = \ + ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources) for resource_message in resources_batch_data.batch: resource_load = dict(resource_message.resource_load) total_resources = dict(resource_message.resources_total) @@ -138,10 +141,17 @@ def update_load_metrics(self): pending_placement_groups = list( resources_batch_data.placement_group_load.placement_group_data) - ip = resource_message.node_manager_address - self.load_metrics.update( - ip, total_resources, available_resources, resource_load, - waiting_bundles, infeasible_bundles, pending_placement_groups) + # Update the load metrics for this raylet. 
+ node_id = ray.utils.binary_to_hex(resource_message.node_id) + ip = self.raylet_id_to_ip_map.get(node_id) + if ip: + self.load_metrics.update(ip, total_resources, + available_resources, resource_load, + waiting_bundles, infeasible_bundles, + pending_placement_groups) + else: + logger.warning( + f"Monitor: could not find ip for node {node_id}") def update_resource_requests(self): """Fetches resource requests from the internal KV and updates load.""" @@ -156,10 +166,29 @@ def update_resource_requests(self): except Exception: logger.exception("Error parsing resource requests") + def update_raylet_map(self, _append_port=False): + """Updates internal raylet map. + + Args: + _append_port (bool): Defaults to False. Appending the port is + useful in testing, as mock clusters have many nodes with + the same IP and cannot be uniquely identified. + """ + all_raylet_nodes = ray.nodes() + self.raylet_id_to_ip_map = {} + for raylet_info in all_raylet_nodes: + node_id = (raylet_info.get("DBClientID") or raylet_info["NodeID"]) + ip_address = (raylet_info.get("AuxAddress") + or raylet_info["NodeManagerAddress"]).split(":")[0] + if _append_port: + ip_address += ":" + str(raylet_info["NodeManagerPort"]) + self.raylet_id_to_ip_map[node_id] = ip_address + def _run(self): """Run the monitor loop.""" while True: + self.update_raylet_map() self.update_load_metrics() self.update_resource_requests() self.update_event_summary() @@ -335,9 +364,9 @@ def run(self): # Something went wrong, so push an error to all drivers. 
redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) + traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The monitor failed with the " - f"following error:\n{traceback.format_exc()}") - from ray.utils import push_error_to_driver_through_redis - push_error_to_driver_through_redis( + f"following error:\n{traceback_str}") + ray.utils.push_error_to_driver_through_redis( redis_client, ray_constants.MONITOR_DIED_ERROR, message) raise e diff --git a/python/ray/tests/test_multi_node_2.py b/python/ray/tests/test_multi_node_2.py index 7569dff68113..b3e739e643eb 100644 --- a/python/ray/tests/test_multi_node_2.py +++ b/python/ray/tests/test_multi_node_2.py @@ -4,7 +4,6 @@ import ray import ray.ray_constants as ray_constants -from ray.util.placement_group import placement_group, remove_placement_group from ray.autoscaler.sdk import request_resources from ray.monitor import Monitor from ray.cluster_utils import Cluster @@ -69,45 +68,16 @@ def f(): def setup_monitor(address): monitor = Monitor( address, None, redis_password=ray_constants.REDIS_DEFAULT_PASSWORD) + monitor.update_raylet_map(_append_port=True) return monitor -def assert_correct_pg(pg_response_data, pg_demands, strategy): - assert len(pg_response_data) == 1 - pg_response_data = pg_response_data[0] - strategy_mapping_dict_protobuf = { - "PACK": 0, - "SPREAD": 1, - "STRICT_PACK": 2, - "STRICT_SPREAD": 3 - } - assert pg_response_data.strategy == strategy_mapping_dict_protobuf[ - strategy] - assert pg_response_data.creator_job_id - assert pg_response_data.creator_actor_id - assert pg_response_data.creator_actor_dead - assert pg_response_data.placement_group_id - - for i, bundle in enumerate(pg_demands): - assert pg_response_data.bundles[i].unit_resources == bundle - assert pg_response_data.bundles[i].bundle_id.placement_group_id - - -# DO NOT CHANGE THIS VERIFICATION WITHOUT NOTIFYING (Eric/Ameer/Alex). 
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): request_resources(num_cpus=42) - # add placement groups. - pg_demands = [{"GPU": 2}, {"extra_resource": 2}] - strategy = "STRICT_PACK" - pg = placement_group(pg_demands, strategy=strategy) - pg.ready() - time.sleep(2) # wait for placemnt groups to propogate. - # Disable event clearing for test. monitor.event_summarizer.clear = lambda *a: None - visited_atleast_once = [set(), set()] while True: monitor.update_load_metrics() monitor.update_resource_requests() @@ -118,29 +88,21 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): req = monitor.load_metrics.resource_requests assert req == [{"CPU": 1}] * 42, req - pg_response_data = monitor.load_metrics.pending_placement_groups - assert_correct_pg(pg_response_data, pg_demands, strategy) - if "memory" in resource_usage[0]: del resource_usage[0]["memory"] - visited_atleast_once[0].add("memory") - if "object_store_memory" in resource_usage[0]: + if "object_store_memory" in resource_usage[1]: del resource_usage[0]["object_store_memory"] - visited_atleast_once[0].add("object_store_memory") if "memory" in resource_usage[1]: del resource_usage[1]["memory"] - visited_atleast_once[1].add("memory") if "object_store_memory" in resource_usage[1]: del resource_usage[1]["object_store_memory"] - visited_atleast_once[1].add("object_store_memory") for key in list(resource_usage[0].keys()): if key.startswith("node:"): del resource_usage[0][key] - visited_atleast_once[0].add("node:") for key in list(resource_usage[1].keys()): if key.startswith("node:"): del resource_usage[1][key] - visited_atleast_once[1].add("node:") + if expected_resource_usage is None: if all(x for x in resource_usage[0:]): break @@ -158,13 +120,6 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): # Sanity check we emitted a resize event. 
assert any("Resized to" in x for x in monitor.event_summarizer.summary()) - assert visited_atleast_once[0] == { - "memory", "object_store_memory", "node:" - } - assert visited_atleast_once[0] == visited_atleast_once[1] - - remove_placement_group(pg) - return resource_usage diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 7178fe7159d8..844f44bea723 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -46,6 +46,19 @@ enum TaskType { DRIVER_TASK = 3; } +// Type of placement group strategy. +enum PlacementStrategy { + // Packs Bundles into as few nodes as possible. + PACK = 0; + // Places Bundles across distinct nodes or processes as even as possible. + SPREAD = 1; + // Packs Bundles within one node. The group is not allowed to span multiple nodes. + STRICT_PACK = 2; + // Places Bundles across distinct nodes. + // The group is not allowed to deploy more than one bundle on a node. + STRICT_SPREAD = 3; +} + // Address of a worker or node manager. message Address { bytes raylet_id = 1; @@ -443,24 +456,3 @@ enum WorkerExitType { // Worker exit due to placement group removal. PLACEMENT_GROUP_REMOVED = 3; } -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following enum to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -// Type of placement group strategy. -enum PlacementStrategy { - // Packs Bundles into as few nodes as possible. - PACK = 0; - // Places Bundles across distinct nodes or processes as even as possible. - SPREAD = 1; - // Packs Bundles within one node. The group is not allowed to span multiple nodes. 
- STRICT_PACK = 2; - // Places Bundles across distinct nodes. - // The group is not allowed to deploy more than one bundle on a node. - STRICT_SPREAD = 3; -} -/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 5da9842f9619..a56bffbe1147 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -158,6 +158,43 @@ message ErrorTableData { double timestamp = 4; } +message PlacementGroupTableData { + // State of a placement group. + enum PlacementGroupState { + // Placement Group is pending or scheduling + PENDING = 0; + // Placement Group is created. + CREATED = 1; + // Placement Group is already removed and won't be reschedule. + REMOVED = 2; + // Placement Group is rescheduling because the node it placed is dead. + RESCHEDULING = 3; + } + + // ID of the PlacementGroup. + bytes placement_group_id = 1; + // The name of the placement group. + string name = 2; + // The array of the bundle in Placement Group. + repeated Bundle bundles = 3; + // The schedule strategy of this Placement Group. + PlacementStrategy strategy = 4; + // Current state of this placement group. + PlacementGroupState state = 5; + // Fields to detect the owner of the placement group + // for automatic lifecycle management. + // The job id that created this placement group. + bytes creator_job_id = 6; + // The actor id that created this placement group. + bytes creator_actor_id = 7; + // Whether or not if the creator job is dead. + bool creator_job_dead = 8; + // Whether or not if the creator actor is dead. + bool creator_actor_dead = 9; + // Whether the placement group is persistent. + bool is_detached = 10; +} + message ScheduleData { map schedule_plan = 1; } @@ -238,11 +275,71 @@ message GcsNodeInfo { int64 timestamp = 10; } +// Represents the demand for a particular resource shape. +message ResourceDemand { + // The resource shape requested. 
This is a map from the resource string + // (e.g., "CPU") to the amount requested. + map shape = 1; + // The number of requests that are ready to run (i.e., dependencies have been + // fulfilled), but that are waiting for resources. + uint64 num_ready_requests_queued = 2; + // The number of requests for which there is no node that is a superset of + // the requested resource shape. + uint64 num_infeasible_requests_queued = 3; + // The number of requests of this shape still queued in CoreWorkers that this + // raylet knows about. + int64 backlog_size = 4; +} + +// Represents the demand sorted by resource shape. +message ResourceLoad { + // A list of all resource demands. The resource shape in each demand is + // unique. + repeated ResourceDemand resource_demands = 1; +} + +message PlacementGroupLoad { + // The list of pending placement group specifications. + repeated PlacementGroupTableData placement_group_data = 1; +} + message HeartbeatTableData { // Node id. bytes node_id = 1; } +message ResourcesData { + // Node id. + bytes node_id = 1; + // Resource capacity currently available on this node manager. + map resources_available = 2; + // Indicates whether available resources is changed. Only used when light + // heartbeat enabled. + bool resources_available_changed = 3; + // Total resource capacity configured for this node manager. + map resources_total = 4; + // Aggregate outstanding resource load on this node manager. + map resource_load = 5; + // Indicates whether resource load is changed. Only used when + // light heartbeat enabled. + bool resource_load_changed = 6; + // The resource load on this node, sorted by resource shape. + ResourceLoad resource_load_by_shape = 7; + // Whether this node manager is requesting global GC. + bool should_global_gc = 8; + // IP address of the node. 
+ string node_manager_address = 9; +} + +message ResourceUsageBatchData { + repeated ResourcesData batch = 1; + // The total resource demand on all nodes included in the batch, sorted by + // resource shape. + ResourceLoad resource_load_by_shape = 2; + // The pending list of placement groups. + PlacementGroupLoad placement_group_load = 3; +} + // Data for a lease on task execution. message TaskLeaseData { // The task ID. @@ -356,109 +453,3 @@ message PubSubMessage { bytes id = 1; bytes data = 2; } - -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following messages to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -// Represents the demand for a particular resource shape. -message ResourceDemand { - // The resource shape requested. This is a map from the resource string - // (e.g., "CPU") to the amount requested. - map shape = 1; - // The number of requests that are ready to run (i.e., dependencies have been - // fulfilled), but that are waiting for resources. - uint64 num_ready_requests_queued = 2; - // The number of requests for which there is no node that is a superset of - // the requested resource shape. - uint64 num_infeasible_requests_queued = 3; - // The number of requests of this shape still queued in CoreWorkers that this - // raylet knows about. - int64 backlog_size = 4; -} - -// Represents the demand sorted by resource shape. -message ResourceLoad { - // A list of all resource demands. The resource shape in each demand is - // unique. - repeated ResourceDemand resource_demands = 1; -} - -message ResourcesData { - // Node id. 
- bytes node_id = 1; - // Resource capacity currently available on this node manager. - map resources_available = 2; - // Indicates whether available resources is changed. Only used when light - // heartbeat enabled. - bool resources_available_changed = 3; - // Total resource capacity configured for this node manager. - map resources_total = 4; - // Aggregate outstanding resource load on this node manager. - map resource_load = 5; - // Indicates whether resource load is changed. Only used when - // light heartbeat enabled. - bool resource_load_changed = 6; - // The resource load on this node, sorted by resource shape. - ResourceLoad resource_load_by_shape = 7; - // Whether this node manager is requesting global GC. - bool should_global_gc = 8; - // IP address of the node. - string node_manager_address = 9; -} - -message ResourceUsageBatchData { - repeated ResourcesData batch = 1; - // The total resource demand on all nodes included in the batch, sorted by - // resource shape. - ResourceLoad resource_load_by_shape = 2; - // The pending list of placement groups. - PlacementGroupLoad placement_group_load = 3; -} - -message PlacementGroupLoad { - // The list of pending placement group specifications. - repeated PlacementGroupTableData placement_group_data = 1; -} - -message PlacementGroupTableData { - // State of a placement group. - enum PlacementGroupState { - // Placement Group is pending or scheduling - PENDING = 0; - // Placement Group is created. - CREATED = 1; - // Placement Group is already removed and won't be reschedule. - REMOVED = 2; - // Placement Group is rescheduling because the node it placed is dead. - RESCHEDULING = 3; - } - - // ID of the PlacementGroup. - bytes placement_group_id = 1; - // The name of the placement group. - string name = 2; - // The array of the bundle in Placement Group. - repeated Bundle bundles = 3; - // The schedule strategy of this Placement Group. - PlacementStrategy strategy = 4; - // Current state of this placement group. 
- PlacementGroupState state = 5; - // Fields to detect the owner of the placement group - // for automatic lifecycle management. - // The job id that created this placement group. - bytes creator_job_id = 6; - // The actor id that created this placement group. - bytes creator_actor_id = 7; - // Whether or not if the creator job is dead. - bool creator_job_dead = 8; - // Whether or not if the creator actor is dead. - bool creator_actor_dead = 9; - // Whether the placement group is persistent. - bool is_detached = 10; -} -/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 78462cb2a5c3..ed5ca92e2a42 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -19,6 +19,11 @@ package ray.rpc; import "src/ray/protobuf/common.proto"; import "src/ray/protobuf/gcs.proto"; +message GcsStatus { + int32 code = 1; + string message = 2; +} + message AddJobRequest { JobTableData data = 1; } @@ -208,6 +213,31 @@ message ReportResourceUsageReply { GcsStatus status = 1; } +message GetAllResourceUsageRequest { +} + +message GetAllResourceUsageReply { + GcsStatus status = 1; + ResourceUsageBatchData resource_usage_data = 2; +} + +// Service for node resource info access. +service NodeResourceInfoGcsService { + // Get node's resources from GCS Service. + rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); + // Update resources of a node in GCS Service. + rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); + // Delete resources of a node in GCS Service. + rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); + // Get available resources of all nodes. + rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) + returns (GetAllAvailableResourcesReply); + // Report resource usage of a node to GCS Service. 
+ rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); + // Get resource usage of all nodes from GCS Service. + rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); +} + // Service for heartbeat info access. service HeartbeatInfoGcsService { // Report heartbeat of a node to GCS Service. @@ -505,41 +535,3 @@ service PlacementGroupInfoGcsService { rpc WaitPlacementGroupUntilReady(WaitPlacementGroupUntilReadyRequest) returns (WaitPlacementGroupUntilReadyReply); } -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following messages to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -message GetAllResourceUsageRequest { -} - -message GetAllResourceUsageReply { - GcsStatus status = 1; - ResourceUsageBatchData resource_usage_data = 2; -} - -// Service for node resource info access. -service NodeResourceInfoGcsService { - // Get node's resources from GCS Service. - rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); - // Update resources of a node in GCS Service. - rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); - // Delete resources of a node in GCS Service. - rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); - // Get available resources of all nodes. - rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) - returns (GetAllAvailableResourcesReply); - // Report resource usage of a node to GCS Service. - rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); - // Get resource usage of all nodes from GCS Service. 
- rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); -} - -message GcsStatus { - int32 code = 1; - string message = 2; -} -/////////////////////////////////////////////////////////////////////////////// From 17626a96ef6f23a538d2e8d3846eb565475aae33 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 048/244] Revert "Update example shuffle script (#14021)" This reverts commit 7bf413455403091426e43c93e4c71261c5cd80f9. --- python/ray/experimental/shuffle.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/ray/experimental/shuffle.py b/python/ray/experimental/shuffle.py index 0a3f0165609f..6b7936ddf85b 100644 --- a/python/ray/experimental/shuffle.py +++ b/python/ray/experimental/shuffle.py @@ -169,10 +169,8 @@ def main(): parser.add_argument("--partition-size", type=float, default=200e6) args = parser.parse_args() - if args.ray_address: - ray.init(address=args.ray_address) - else: - ray.init(object_store_memory=args.object_store_memory) + ray.init( + address=args.ray_address, object_store_memory=args.object_store_memory) partition_size = int(args.partition_size) num_partitions = args.num_partitions From a28231310258c831cc7cb9948b0b68f3879e1b59 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 049/244] Revert "[dask-on-ray] Add multiple return DataFrame shuffle optimization. (#13951)" This reverts commit 99b20d0099894ed08b59e7959f1eac9717cac190. 
--- python/ray/tests/test_dask_optimization.py | 63 -------- python/ray/util/dask/__init__.py | 5 - python/ray/util/dask/optimizations.py | 160 --------------------- python/ray/util/dask/scheduler.py | 36 +---- 4 files changed, 5 insertions(+), 259 deletions(-) delete mode 100644 python/ray/tests/test_dask_optimization.py delete mode 100644 python/ray/util/dask/optimizations.py diff --git a/python/ray/tests/test_dask_optimization.py b/python/ray/tests/test_dask_optimization.py deleted file mode 100644 index e8a045aeee24..000000000000 --- a/python/ray/tests/test_dask_optimization.py +++ /dev/null @@ -1,63 +0,0 @@ -import dask -import dask.dataframe as dd -from dask.dataframe.shuffle import SimpleShuffleLayer -import mock -import numpy as np -import pandas as pd -import pytest - -from ray.util.dask import dataframe_optimize -from ray.util.dask.optimizations import (rewrite_simple_shuffle_layer, - MultipleReturnSimpleShuffleLayer) - - -def test_rewrite_simple_shuffle_layer(): - npartitions = 10 - df = dd.from_pandas( - pd.DataFrame( - np.random.randint(0, 100, size=(100, 2)), columns=["age", - "grade"]), - npartitions=npartitions) - # We set max_branch=npartitions in order to ensure that the task-based - # shuffle happens in a single stage, which is required in order for our - # optimization to work. 
- a = df.set_index(["age"], shuffle="tasks", max_branch=npartitions) - - dsk = a.__dask_graph__() - keys = a.__dask_keys__() - assert any(type(v) is SimpleShuffleLayer for k, v in dsk.layers.items()) - dsk = rewrite_simple_shuffle_layer(dsk, keys) - assert all( - type(v) is not SimpleShuffleLayer for k, v in dsk.layers.items()) - assert any( - type(v) is MultipleReturnSimpleShuffleLayer - for k, v in dsk.layers.items()) - - -@mock.patch("ray.util.dask.optimizations.rewrite_simple_shuffle_layer") -def test_dataframe_optimize(mock_rewrite): - def side_effect(dsk, keys): - return rewrite_simple_shuffle_layer(dsk, keys) - - mock_rewrite.side_effect = side_effect - with dask.config.set(dataframe_optimize=dataframe_optimize): - npartitions = 10 - df = dd.from_pandas( - pd.DataFrame( - np.random.randint(0, 100, size=(100, 2)), - columns=["age", "grade"]), - npartitions=npartitions) - # We set max_branch=npartitions in order to ensure that the task-based - # shuffle happens in a single stage, which is required in order for our - # optimization to work. 
- a = df.set_index( - ["age"], shuffle="tasks", max_branch=npartitions).compute() - - assert mock_rewrite.call_count == 2 - assert a.index.is_monotonic_increasing - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/dask/__init__.py b/python/ray/util/dask/__init__.py index 10a08379c847..bfe28571ad75 100644 --- a/python/ray/util/dask/__init__.py +++ b/python/ray/util/dask/__init__.py @@ -4,16 +4,11 @@ local_ray_callbacks, unpack_ray_callbacks, ) -from .optimizations import dataframe_optimize __all__ = [ - # Schedulers "ray_dask_get", "ray_dask_get_sync", - # Callbacks "RayDaskCallback", "local_ray_callbacks", "unpack_ray_callbacks", - # Optimizations - "dataframe_optimize", ] diff --git a/python/ray/util/dask/optimizations.py b/python/ray/util/dask/optimizations.py deleted file mode 100644 index c36757af691f..000000000000 --- a/python/ray/util/dask/optimizations.py +++ /dev/null @@ -1,160 +0,0 @@ -import operator -import warnings - -import dask -from dask import core -from dask.core import istask -from dask.dataframe.core import _concat -from dask.dataframe.optimize import optimize -from dask.dataframe.shuffle import shuffle_group -from dask.highlevelgraph import HighLevelGraph - -from .scheduler import MultipleReturnFunc, multiple_return_get - -try: - from dask.dataframe.shuffle import SimpleShuffleLayer -except ImportError: - # SimpleShuffleLayer doesn't exist in this version of Dask. - SimpleShuffleLayer = None - -if SimpleShuffleLayer is not None: - - class MultipleReturnSimpleShuffleLayer(SimpleShuffleLayer): - @classmethod - def clone(cls, layer: SimpleShuffleLayer): - # TODO(Clark): Probably don't need this since SimpleShuffleLayer - # implements __copy__() and the shallow clone should be enough? 
- return cls( - name=layer.name, - column=layer.column, - npartitions=layer.npartitions, - npartitions_input=layer.npartitions_input, - ignore_index=layer.ignore_index, - name_input=layer.name_input, - meta_input=layer.meta_input, - parts_out=layer.parts_out, - annotations=layer.annotations, - ) - - def __repr__(self): - return (f"MultipleReturnSimpleShuffleLayer") - - def __reduce__(self): - attrs = [ - "name", - "column", - "npartitions", - "npartitions_input", - "ignore_index", - "name_input", - "meta_input", - "parts_out", - "annotations", - ] - return (MultipleReturnSimpleShuffleLayer, - tuple(getattr(self, attr) for attr in attrs)) - - def _cull(self, parts_out): - return MultipleReturnSimpleShuffleLayer( - self.name, - self.column, - self.npartitions, - self.npartitions_input, - self.ignore_index, - self.name_input, - self.meta_input, - parts_out=parts_out, - ) - - def _construct_graph(self): - """Construct graph for a simple shuffle operation.""" - - shuffle_group_name = "group-" + self.name - shuffle_split_name = "split-" + self.name - - dsk = {} - n_parts_out = len(self.parts_out) - for part_out in self.parts_out: - # TODO(Clark): Find better pattern than in-scheduler concat. 
- _concat_list = [(shuffle_split_name, part_out, part_in) - for part_in in range(self.npartitions_input)] - dsk[(self.name, part_out)] = (_concat, _concat_list, - self.ignore_index) - for _, _part_out, _part_in in _concat_list: - dsk[(shuffle_split_name, _part_out, _part_in)] = ( - multiple_return_get, - (shuffle_group_name, _part_in), - _part_out, - ) - if (shuffle_group_name, _part_in) not in dsk: - dsk[(shuffle_group_name, _part_in)] = ( - MultipleReturnFunc( - shuffle_group, - n_parts_out, - ), - (self.name_input, _part_in), - self.column, - 0, - self.npartitions, - self.npartitions, - self.ignore_index, - self.npartitions, - ) - - return dsk - - def rewrite_simple_shuffle_layer(dsk, keys): - if not isinstance(dsk, HighLevelGraph): - dsk = HighLevelGraph.from_collections( - id(dsk), dsk, dependencies=()) - else: - dsk = dsk.copy() - - layers = dsk.layers.copy() - for key, layer in layers.items(): - if type(layer) is SimpleShuffleLayer: - dsk.layers[key] = MultipleReturnSimpleShuffleLayer.clone(layer) - return dsk - - def dataframe_optimize(dsk, keys, **kwargs): - if not isinstance(keys, (list, set)): - keys = [keys] - keys = list(core.flatten(keys)) - - if not isinstance(dsk, HighLevelGraph): - dsk = HighLevelGraph.from_collections( - id(dsk), dsk, dependencies=()) - - dsk = rewrite_simple_shuffle_layer(dsk, keys=keys) - return optimize(dsk, keys, **kwargs) -else: - - def dataframe_optimize(dsk, keys, **kwargs): - warnings.warn("Custom dataframe shuffle optimization only works on " - "dask>=2020.12.0, you are on version " - f"{dask.__version__}, please upgrade Dask." - "Falling back to default dataframe optimizer.") - return optimize(dsk, keys, **kwargs) - - -# Stale approaches below. 
- - -def fuse_splits_into_multiple_return(dsk, keys): - if not isinstance(dsk, HighLevelGraph): - dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=()) - else: - dsk = dsk.copy() - dependencies = dsk.dependencies.copy() - for k, v in dsk.items(): - if istask(v) and v[0] == shuffle_group: - task_deps = dependencies[k] - # Only rewrite shuffle group split if all downstream dependencies - # are splits. - if all( - istask(dsk[dep]) and dsk[dep][0] == operator.getitem - for dep in task_deps): - for dep in task_deps: - # Rewrite split - pass diff --git a/python/ray/util/dask/scheduler.py b/python/ray/util/dask/scheduler.py index d6a8a6edc132..0614d35641ec 100644 --- a/python/ray/util/dask/scheduler.py +++ b/python/ray/util/dask/scheduler.py @@ -1,7 +1,6 @@ import atexit from collections import defaultdict from multiprocessing.pool import ThreadPool -from dataclasses import dataclass import threading import ray @@ -271,31 +270,19 @@ def _rayify_task( return alternate_return func, args = task[0], task[1:] - if func is multiple_return_get: - return _execute_task(task, deps) # If the function's arguments contain nested object references, we must # unpack said object references into a flat set of arguments so that # Ray properly tracks the object dependencies between Ray tasks. - arg_object_refs, repack = unpack_object_refs(args, deps) + object_refs, repack = unpack_object_refs(args, deps) # Submit the task using a wrapper function. 
- object_refs = dask_task_wrapper.options( - name=f"dask:{key!s}", - num_returns=(1 if not isinstance(func, MultipleReturnFunc) else - func.num_returns), - ).remote( - func, - repack, - key, - ray_pretask_cbs, - ray_posttask_cbs, - *arg_object_refs, - ) + object_ref = dask_task_wrapper.options(name=f"dask:{key!s}").remote( + func, repack, key, ray_pretask_cbs, ray_posttask_cbs, *object_refs) if ray_postsubmit_cbs is not None: for cb in ray_postsubmit_cbs: - cb(task, key, deps, object_refs) + cb(task, key, deps, object_ref) - return object_refs + return object_ref elif not ishashable(task): return task elif task in deps: @@ -447,16 +434,3 @@ def ray_dask_get_sync(dsk, keys, **kwargs): cb(result) return result - - -@dataclass -class MultipleReturnFunc: - func: callable - num_returns: int - - def __call__(self, *args, **kwargs): - return self.func(*args, **kwargs) - - -def multiple_return_get(multiple_returns, idx): - return multiple_returns[idx] From 0029491b72f416e312f888817c59d8562919b045 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 050/244] Revert "Revert "Revert "[Java] fix test hang occasionally when running FailureTest (#13934)" (#13992)" (#14008)" This reverts commit 71aac4e604bc5de7c4b46d2bd0587e20dde67032. 
--- .../io/ray/runtime/runner/RunManager.java | 2 +- java/test.sh | 57 +++--- .../io/ray/test/TestProgressListener.java | 166 ++---------------- java/testng.xml | 2 +- src/ray/core_worker/core_worker.cc | 18 +- src/ray/core_worker/core_worker.h | 2 - 6 files changed, 44 insertions(+), 203 deletions(-) diff --git a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java index 192e5550ceb4..2307b0489d3c 100644 --- a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java +++ b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java @@ -96,7 +96,7 @@ public static void getAddressInfoAndFillConfig(RayConfig rayConfig) { * * @param command The command to start the process with. */ - public static String runCommand(List command) throws IOException, InterruptedException { + private static String runCommand(List command) throws IOException, InterruptedException { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Starting process with command: {}", Joiner.on(" ").join(command)); } diff --git a/java/test.sh b/java/test.sh index b49f06037c10..a842194e67fb 100755 --- a/java/test.sh +++ b/java/test.sh @@ -16,27 +16,30 @@ pushd "$ROOT_DIR" mvn -T16 checkstyle:check popd +on_exit() { + exit_code=$? + if [ $exit_code -ne 0 ]; then + echo "Exit trap, printing ray logs" + cat /tmp/ray/session_latest/logs/* + fi +} + +trap on_exit EXIT + run_testng() { - local pid local exit_code - "$@" & - pid=$! - if wait $pid; then + if "$@"; then exit_code=0 else exit_code=$? fi # exit_code == 2 means there are skipped tests. if [ $exit_code -ne 2 ] && [ $exit_code -ne 0 ] ; then - # Only print log files if it ran in cluster mode - if [[ ! "$*" =~ SINGLE_PROCESS ]]; then - if [ $exit_code -gt 128 ] ; then - # Test crashed. Print the driver log for diagnosis. - cat /tmp/ray/session_latest/logs/java-core-driver-*$pid* - fi + if [ $exit_code -gt 128 ] ; then + # Test crashed. Print the driver log for diagnosis. 
+ cat /tmp/ray/session_latest/logs/java-core-driver-* fi - # Only print the hs_err_pid file of TestNG process - find . -name "hs_err_pid$pid.log" -exec cat {} + + find . -name "hs_err_*log" -exec cat {} + exit $exit_code fi } @@ -57,31 +60,11 @@ if ! git diff --exit-code -- java src/ray/core_worker/lib/java; then exit 1 fi -# NOTE(kfstrom): Java test troubleshooting only. -# Set MAX_ROUNDS to a big number (e.g. 1000) to run Java tests repeatedly. -# You may also want to modify java/testng.xml to run only a subset of test cases. -MAX_ROUNDS=1 -if [ $MAX_ROUNDS -gt 1 ]; then - export RAY_BACKEND_LOG_LEVEL=debug -fi - -round=1 -while true; do - echo Starting cluster mode test round $round - - echo "Running tests under cluster mode." - # TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, - # TestNG will exit with code 2. And bazel treats it as test failure. - # bazel test //java:all_tests --config=ci || cluster_exit_code=$? - run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml - - echo Finished cluster mode test round $round - date - round=$((round+1)) - if (( round > MAX_ROUNDS )); then - break - fi -done +echo "Running tests under cluster mode." +# TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, +# TestNG will exit with code 2. And bazel treats it as test failure. +# bazel test //java:all_tests --config=ci || cluster_exit_code=$? +run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running tests under single-process mode." # bazel test //java:all_tests --jvmopt="-Dray.run-mode=SINGLE_PROCESS" --config=ci || single_exit_code=$? 
diff --git a/java/test/src/main/java/io/ray/test/TestProgressListener.java b/java/test/src/main/java/io/ray/test/TestProgressListener.java index 915d82af317b..1fed5ac21375 100644 --- a/java/test/src/main/java/io/ray/test/TestProgressListener.java +++ b/java/test/src/main/java/io/ray/test/TestProgressListener.java @@ -1,42 +1,27 @@ package io.ray.test; -import com.google.common.collect.ImmutableList; -import io.ray.runtime.runner.RunManager; -import java.io.File; import java.time.LocalDateTime; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.SystemUtils; import org.testng.IInvokedMethod; import org.testng.IInvokedMethodListener; import org.testng.ITestContext; import org.testng.ITestListener; import org.testng.ITestResult; -import org.testng.SkipException; public class TestProgressListener implements IInvokedMethodListener, ITestListener { - // Travis aborts CI if no outputs for 10 minutes. So threshold needs to be smaller than 10m. - private static final long hangDetectionThresholdMillis = 5 * 60 * 1000; - private static final int TAIL_NO_OF_LINES = 500; - private Thread testMainThread; - private long testStartTimeMillis; - private String getFullTestName(ITestResult testResult) { return testResult.getTestClass().getName() + "." 
+ testResult.getMethod().getMethodName(); } - private void printSection(String sectionName) { + private void printInfo(String tag, String content) { System.out.println( - "============ [" + LocalDateTime.now().toString() + "] " + sectionName + " ============"); - } - - private void printTestStage(String tag, String content) { - printSection("[" + tag + "] " + content); + "============ [" + + LocalDateTime.now().toString() + + "] [" + + tag + + "] " + + content + + " ============"); } @Override @@ -47,50 +32,31 @@ public void afterInvocation(IInvokedMethod method, ITestResult testResult) {} @Override public void onTestStart(ITestResult result) { - printTestStage("TEST START", getFullTestName(result)); - testStartTimeMillis = System.currentTimeMillis(); - // TODO(kfstorm): Add a timer to detect hang - if (testMainThread == null) { - testMainThread = Thread.currentThread(); - Thread hangDetectionThread = - new Thread( - () -> { - try { - // If current task case has ran for more than 5 minutes. 
- while (System.currentTimeMillis() - testStartTimeMillis - < hangDetectionThresholdMillis) { - Thread.sleep(1000); - } - printDebugInfo(null, /*testHanged=*/ true); - } catch (InterruptedException e) { - // ignored - } - }); - hangDetectionThread.setDaemon(true); - hangDetectionThread.start(); - } + printInfo("TEST START", getFullTestName(result)); } @Override public void onTestSuccess(ITestResult result) { - printTestStage("TEST SUCCESS", getFullTestName(result)); + printInfo("TEST SUCCESS", getFullTestName(result)); } @Override public void onTestFailure(ITestResult result) { - printTestStage("TEST FAILURE", getFullTestName(result)); - printDebugInfo(result, /*testHanged=*/ false); + printInfo("TEST FAILURE", getFullTestName(result)); + Throwable throwable = result.getThrowable(); + if (throwable != null) { + throwable.printStackTrace(); + } } @Override public void onTestSkipped(ITestResult result) { - printTestStage("TEST SKIPPED", getFullTestName(result)); - printDebugInfo(result, /*testHanged=*/ false); + printInfo("TEST SKIPPED", getFullTestName(result)); } @Override public void onTestFailedButWithinSuccessPercentage(ITestResult result) { - printTestStage("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); + printInfo("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); } @Override @@ -98,102 +64,4 @@ public void onStart(ITestContext context) {} @Override public void onFinish(ITestContext context) {} - - private void printDebugInfo(ITestResult result, boolean testHanged) { - boolean testFailed = false; - if (result != null) { - Throwable throwable = result.getThrowable(); - if (throwable != null && !(throwable instanceof SkipException)) { - testFailed = true; - throwable.printStackTrace(); - } - } - if (!testFailed && !testHanged) { - return; - } - - if (testHanged) { - printSection("TEST CASE HANGED"); - printSection("STACK TRACE OF TEST THREAD"); - for (StackTraceElement element : testMainThread.getStackTrace()) { - 
System.out.println(element.toString()); - } - Set javaPids = getJavaPids(); - for (Integer pid : javaPids) { - runCommandSafely(ImmutableList.of("jstack", pid.toString())); - // TODO(kfstorm): Check lldb or gdb exists rather than detecting OS type. - if (SystemUtils.IS_OS_MAC) { - runCommandSafely( - ImmutableList.of("lldb", "--batch", "-o", "bt all", "-p", pid.toString())); - } else { - runCommandSafely( - ImmutableList.of( - "sudo", "gdb", "-batch", "-ex", "thread apply all bt", "-p", pid.toString())); - } - } - } - - printLogFiles(); - - if (testHanged) { - printSection("ABORT TEST"); - System.exit(1); - } - } - - private String runCommandSafely(List command) { - String output; - String commandString = String.join(" ", command); - printSection(commandString); - try { - output = RunManager.runCommand(command); - System.out.println(output); - } catch (Exception e) { - System.out.println("Failed to execute command: " + commandString); - e.printStackTrace(); - output = ""; - } - return output; - } - - private Set getJavaPids() { - Set javaPids = new HashSet<>(); - String jpsOutput = runCommandSafely(ImmutableList.of("jps", "-v")); - try { - for (String line : StringUtils.split(jpsOutput, "\n")) { - String[] parts = StringUtils.split(line); - if (parts.length > 1 && parts[1].toLowerCase().equals("jps")) { - // Skip jps. 
- continue; - } - Integer pid = Integer.valueOf(parts[0]); - javaPids.add(pid); - } - } catch (Exception e) { - System.out.println("Failed to parse jps output."); - e.printStackTrace(); - } - - String pgrepJavaResult = runCommandSafely(ImmutableList.of("pgrep", "java")); - try { - for (String line : StringUtils.split(pgrepJavaResult, "\n")) { - Integer pid = Integer.valueOf(line); - javaPids.add(pid); - } - } catch (Exception e) { - System.out.println("Failed to parse pgrep java output."); - e.printStackTrace(); - } - - return javaPids; - } - - private void printLogFiles() { - Collection logFiles = - FileUtils.listFiles(new File("/tmp/ray/session_latest/logs"), null, false); - for (File file : logFiles) { - runCommandSafely( - ImmutableList.of("tail", "-n", String.valueOf(TAIL_NO_OF_LINES), file.getAbsolutePath())); - } - } } diff --git a/java/testng.xml b/java/testng.xml index 0db2704845d4..6cc10b9ab24a 100644 --- a/java/testng.xml +++ b/java/testng.xml @@ -1,6 +1,6 @@ - + diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index cf5a1f532cb9..6c8287c1507b 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -161,21 +161,15 @@ CoreWorkerProcess::CoreWorkerProcess(const CoreWorkerOptions &options) // RayConfig is generated in Java_io_ray_runtime_RayNativeRuntime_nativeInitialize // for java worker or in constructor of CoreWorker for python worker. ray::stats::Init(global_tags, options_.metrics_agent_port); - -#ifndef _WIN32 - // NOTE(kfstorm): std::atexit should be put at the end of `CoreWorkerProcess` - // constructor. We assume that spdlog has been initialized before this line. When the - // process is exiting, `HandleAtExit` will be invoked before destructing spdlog static - // variables. We explicitly destruct `CoreWorkerProcess` instance in the callback to - // ensure the static `CoreWorkerProcess` instance is destructed while spdlog is still - // usable. 
This prevents crashing (or hanging) when using `RAY_LOG` in - // `CoreWorkerProcess` destructor. - RAY_CHECK(std::atexit(CoreWorkerProcess::HandleAtExit) == 0); -#endif } CoreWorkerProcess::~CoreWorkerProcess() { RAY_LOG(INFO) << "Destructing CoreWorkerProcess. pid: " << getpid(); + { + // Check that all `CoreWorker` instances have been removed. + absl::ReaderMutexLock lock(&worker_map_mutex_); + RAY_CHECK(workers_.empty()); + } RAY_LOG(DEBUG) << "Stats stop in core worker."; // Shutdown stats module if worker process exits. ray::stats::Shutdown(); @@ -189,8 +183,6 @@ void CoreWorkerProcess::EnsureInitialized() { << "shutdown."; } -void CoreWorkerProcess::HandleAtExit() { instance_.reset(); } - std::shared_ptr CoreWorkerProcess::TryGetWorker(const WorkerID &worker_id) { if (!instance_) { return nullptr; diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 72ef4f36ca7b..6fa24c29e94e 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -265,8 +265,6 @@ class CoreWorkerProcess { /// \return Void. static void EnsureInitialized(); - static void HandleAtExit(); - /// Get the `CoreWorker` instance by worker ID. /// /// \param[in] workerId The worker ID. From b749e0a9044122ca0af987d1773a375a8f56e3da Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 051/244] Revert "Revert "[Core]Fix ray.kill doesn't cancel pending actor bug (#13254)" (#14013)" This reverts commit 7293125d69e39144aa32a1861b97222f7f656745. 
--- .../main/java/io/ray/test/KillActorTest.java | 2 + python/ray/tests/test_actor_advanced.py | 84 ++++++++++++ python/ray/tests/test_placement_group.py | 12 +- python/ray/tests/test_queue.py | 6 +- python/ray/tests/test_reference_counting.py | 4 +- src/ray/core_worker/core_worker.cc | 4 +- src/ray/core_worker/core_worker.h | 1 + src/ray/gcs/accessor.h | 10 ++ .../gcs/gcs_client/service_based_accessor.cc | 20 +++ .../gcs/gcs_client/service_based_accessor.h | 3 + src/ray/gcs/gcs_server/gcs_actor_manager.cc | 126 +++++++++++++----- src/ray/gcs/gcs_server/gcs_actor_manager.h | 26 +++- src/ray/gcs/gcs_server/gcs_actor_scheduler.cc | 36 ++++- src/ray/gcs/gcs_server/gcs_actor_scheduler.h | 6 +- .../gcs_server/test/gcs_actor_manager_test.cc | 9 +- .../test/gcs_actor_scheduler_test.cc | 3 +- src/ray/protobuf/gcs_service.proto | 18 +++ src/ray/rpc/gcs_server/gcs_rpc_client.h | 4 + src/ray/rpc/gcs_server/gcs_rpc_server.h | 5 + 19 files changed, 325 insertions(+), 54 deletions(-) diff --git a/java/test/src/main/java/io/ray/test/KillActorTest.java b/java/test/src/main/java/io/ray/test/KillActorTest.java index fd92b97118ef..753b00a9c59c 100644 --- a/java/test/src/main/java/io/ray/test/KillActorTest.java +++ b/java/test/src/main/java/io/ray/test/KillActorTest.java @@ -59,6 +59,8 @@ private static void remoteKill(ActorHandle actor, boolean noRestart) { private void testKillActor(BiConsumer, Boolean> kill, boolean noRestart) { ActorHandle actor = Ray.actor(HangActor::new).setMaxRestarts(1).remote(); + // Wait for the actor to be created. + actor.task(HangActor::ping).remote().get(); ObjectRef result = actor.task(HangActor::hang).remote(); // The actor will hang in this task. 
Assert.assertEquals(0, Ray.wait(ImmutableList.of(result), 1, 500).getReady().size()); diff --git a/python/ray/tests/test_actor_advanced.py b/python/ray/tests/test_actor_advanced.py index 1913decf83df..496e977fe9cd 100644 --- a/python/ray/tests/test_actor_advanced.py +++ b/python/ray/tests/test_actor_advanced.py @@ -1093,6 +1093,90 @@ class Actor2: global_state_accessor.disconnect() +def test_kill_pending_actor_with_no_restart_true(): + cluster = ray.init() + global_state_accessor = GlobalStateAccessor( + cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) + global_state_accessor.connect() + + @ray.remote(resources={"WORKER": 1.0}) + class PendingActor: + pass + + # Kill actor with `no_restart=True`. + actor = PendingActor.remote() + # TODO(ffbin): The raylet doesn't guarantee the order when dealing with + # RequestWorkerLease and CancelWorkerLease. If we kill the actor + # immediately after creating the actor, we may not be able to clean up + # the request cached by the raylet. + # See https://github.com/ray-project/ray/issues/13545 for details. + time.sleep(1) + ray.kill(actor, no_restart=True) + + def condition1(): + message = global_state_accessor.get_all_resource_usage() + resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( + message) + if len(resource_usages.resource_load_by_shape.resource_demands) == 0: + return True + return False + + # Actor is dead, so the infeasible task queue length is 0. + wait_for_condition(condition1, timeout=10) + + global_state_accessor.disconnect() + ray.shutdown() + + +def test_kill_pending_actor_with_no_restart_false(): + cluster = ray.init() + global_state_accessor = GlobalStateAccessor( + cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) + global_state_accessor.connect() + + @ray.remote(resources={"WORKER": 1.0}, max_restarts=1) + class PendingActor: + pass + + # Kill actor with `no_restart=False`. 
+ actor = PendingActor.remote() + # TODO(ffbin): The raylet doesn't guarantee the order when dealing with + # RequestWorkerLease and CancelWorkerLease. If we kill the actor + # immediately after creating the actor, we may not be able to clean up + # the request cached by the raylet. + # See https://github.com/ray-project/ray/issues/13545 for details. + time.sleep(1) + ray.kill(actor, no_restart=False) + + def condition1(): + message = global_state_accessor.get_all_resource_usage() + resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( + message) + if len(resource_usages.resource_load_by_shape.resource_demands) == 0: + return False + return True + + # Actor restarts, so the infeasible task queue length is 1. + wait_for_condition(condition1, timeout=10) + + # Kill actor again and actor is dead, + # so the infeasible task queue length is 0. + ray.kill(actor, no_restart=False) + + def condition2(): + message = global_state_accessor.get_all_resource_usage() + resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( + message) + if len(resource_usages.resource_load_by_shape.resource_demands) == 0: + return True + return False + + wait_for_condition(condition2, timeout=10) + + global_state_accessor.disconnect() + ray.shutdown() + + if __name__ == "__main__": import pytest # Test suite is timing out. Disable on windows for now. diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 024ff6c5557a..92ef90ca4e1e 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -902,8 +902,10 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. 
ray.kill(a) - with pytest.raises(ray.exceptions.RayActorError): + try: ray.get(a.ready.remote()) + except ray.exceptions.RayActorError: + pass # Now create an actor, but do not capture the current tasks a = Actor.options( @@ -925,8 +927,10 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - with pytest.raises(ray.exceptions.RayActorError): + try: ray.get(a.ready.remote()) + except ray.exceptions.RayActorError: + pass # Lastly, make sure when None is specified, actors are not scheduled # on the same placement group. @@ -1416,8 +1420,10 @@ def schedule_nested_actor_with_detached_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - with pytest.raises(ray.exceptions.RayActorError): + try: ray.get(a.ready.remote()) + except ray.exceptions.RayActorError: + pass # We should have 2 alive pgs and 4 alive actors. assert assert_alive_num_pg(2) diff --git a/python/ray/tests/test_queue.py b/python/ray/tests/test_queue.py index 6c2fb5cf0ec9..88cf6d7b647f 100644 --- a/python/ray/tests/test_queue.py +++ b/python/ray/tests/test_queue.py @@ -199,17 +199,19 @@ def test_custom_resources(ray_start_regular_shared): assert current_resources["CPU"] == 1.0 # By default an actor should not reserve any resources. - Queue() + q = Queue() current_resources = ray.available_resources() assert current_resources["CPU"] == 1.0 + q.shutdown() # Specify resource requirement. The queue should now reserve 1 CPU. 
- Queue(actor_options={"num_cpus": 1}) + q = Queue(actor_options={"num_cpus": 1}) def no_cpu_in_resources(): return "CPU" not in ray.available_resources() wait_for_condition(no_cpu_in_resources) + q.shutdown() if __name__ == "__main__": diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 02638ed3dea8..9fcd3c25f4c4 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -470,8 +470,10 @@ def delete_ref2(self): # Test that the actor exiting stops the reference from being pinned. ray.kill(actor) # Wait for the actor to exit. - with pytest.raises(ray.exceptions.RayActorError): + try: ray.get(actor.delete_ref1.remote()) + except ray.exceptions.RayActorError: + pass else: # Test that deleting the second reference stops it from being pinned. ray.get(actor.delete_ref2.remote()) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 6c8287c1507b..f7c663b5043b 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1629,7 +1629,9 @@ Status CoreWorker::KillActor(const ActorID &actor_id, bool force_kill, bool no_r stream << "Failed to find a corresponding actor handle for " << actor_id; return Status::Invalid(stream.str()); } - direct_actor_submitter_->KillActor(actor_id, force_kill, no_restart); + + RAY_CHECK_OK( + gcs_client_->Actors().AsyncKillActor(actor_id, force_kill, no_restart, nullptr)); return Status::OK(); } diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 6fa24c29e94e..83242c00059b 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -728,6 +728,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Tell an actor to exit immediately, without completing outstanding work. /// /// \param[in] actor_id ID of the actor to kill. 
+ /// \param[in] force_kill Whether to force kill an actor by killing the worker. /// \param[in] no_restart If set to true, the killed actor will not be /// restarted anymore. /// \param[out] Status diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index be929ec3ff0d..db240b411cdf 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -64,6 +64,16 @@ class ActorInfoAccessor { virtual Status AsyncRegisterActor(const TaskSpecification &task_spec, const StatusCallback &callback) = 0; + /// Kill actor via GCS asynchronously. + /// + /// \param actor_id The ID of actor to destroy. + /// \param force_kill Whether to force kill an actor by killing the worker. + /// \param no_restart If set to true, the killed actor will not be restarted anymore. + /// \param callback Callback that will be called after the actor is destroyed. + /// \return Status + virtual Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, + const StatusCallback &callback) = 0; + /// Asynchronously request GCS to create the actor. /// /// This should be called after the worker has resolved the actor dependencies. 
diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index a82e0ab6bcdd..5905966cb92a 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -200,6 +200,26 @@ Status ServiceBasedActorInfoAccessor::AsyncRegisterActor( return Status::OK(); } +Status ServiceBasedActorInfoAccessor::AsyncKillActor( + const ActorID &actor_id, bool force_kill, bool no_restart, + const ray::gcs::StatusCallback &callback) { + rpc::KillActorViaGcsRequest request; + request.set_actor_id(actor_id.Binary()); + request.set_force_kill(force_kill); + request.set_no_restart(no_restart); + client_impl_->GetGcsRpcClient().KillActorViaGcs( + request, [callback](const Status &, const rpc::KillActorViaGcsReply &reply) { + if (callback) { + auto status = + reply.status().code() == (int)StatusCode::OK + ? Status() + : Status(StatusCode(reply.status().code()), reply.status().message()); + callback(status); + } + }); + return Status::OK(); +} + Status ServiceBasedActorInfoAccessor::AsyncCreateActor( const ray::TaskSpecification &task_spec, const ray::gcs::StatusCallback &callback) { RAY_CHECK(task_spec.IsActorCreationTask() && callback); diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index c883e7b626a7..8aab5198f28e 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -85,6 +85,9 @@ class ServiceBasedActorInfoAccessor : public ActorInfoAccessor { Status AsyncCreateActor(const TaskSpecification &task_spec, const StatusCallback &callback) override; + Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, + const StatusCallback &callback) override; + Status AsyncSubscribeAll( const SubscribeCallback &subscribe, const StatusCallback &done) override; diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc 
b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 2f3740654c8b..338fc149c327 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -214,6 +214,25 @@ void GcsActorManager::HandleGetNamedActorInfo( ++counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST]; } +void GcsActorManager::HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, + rpc::KillActorViaGcsReply *reply, + rpc::SendReplyCallback send_reply_callback) { + const auto &actor_id = ActorID::FromBinary(request.actor_id()); + bool force_kill = request.force_kill(); + bool no_restart = request.no_restart(); + if (no_restart) { + DestroyActor(actor_id); + } else { + KillActor(actor_id, force_kill, no_restart); + } + + GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); + RAY_LOG(DEBUG) << "Finished killing actor, job id = " << actor_id.JobId() + << ", actor id = " << actor_id << ", force_kill = " << force_kill + << ", no_restart = " << no_restart; + ++counts_[CountType::KILL_ACTOR_REQUEST]; +} + Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &request, RegisterActorCallback success_callback) { // NOTE: After the abnormal recovery of the network between GCS client and GCS server or @@ -417,8 +436,11 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { actor_to_register_callbacks_.erase(actor_id); actor_to_create_callbacks_.erase(actor_id); auto it = registered_actors_.find(actor_id); - RAY_CHECK(it != registered_actors_.end()) - << "Tried to destroy actor that does not exist " << actor_id; + if (it == registered_actors_.end()) { + RAY_LOG(INFO) << "Tried to destroy actor that does not exist " << actor_id; + return; + } + const auto &task_id = it->second->GetCreationTaskSpecification().TaskId(); it->second->GetMutableActorTableData()->mutable_task_spec()->Clear(); it->second->GetMutableActorTableData()->set_timestamp(current_sys_time_ms()); AddDestroyedActorToCache(it->second); @@ -456,38 +478,13 @@ void 
GcsActorManager::DestroyActor(const ActorID &actor_id) { if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { // The actor has already been created. Destroy the process by force-killing // it. - KillActor(actor); + NotifyCoreWorkerToKillActor(actor); RAY_CHECK(node_it->second.erase(actor->GetWorkerID())); if (node_it->second.empty()) { created_actors_.erase(node_it); } } else { - // The actor has not been created yet. It is either being scheduled or is - // pending scheduling. - auto canceled_actor_id = - gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); - if (!canceled_actor_id.IsNil()) { - // The actor was being scheduled and has now been canceled. - RAY_CHECK(canceled_actor_id == actor_id); - } else { - auto pending_it = - std::find_if(pending_actors_.begin(), pending_actors_.end(), - [actor_id](const std::shared_ptr &actor) { - return actor->GetActorID() == actor_id; - }); - - // The actor was pending scheduling. Remove it from the queue. - if (pending_it != pending_actors_.end()) { - pending_actors_.erase(pending_it); - } else { - // When actor creation request of this actor id is pending in raylet, - // it doesn't responds, and the actor should be still in leasing state. - // NOTE: Raylet will cancel the lease request once it receives the - // actor state notification. So this method doesn't have to cancel - // outstanding lease request by calling raylet_client->CancelWorkerLease - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id); - } - } + CancelActorInScheduling(actor, task_id); } } @@ -706,7 +703,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, [this, actor, actor_id, mutable_actor_table_data](Status status) { - // if actor was an detached actor, make sure to destroy it. + // If actor was an detached actor, make sure to destroy it. 
// We need to do this because detached actors are not destroyed // when its owners are dead because it doesn't have owners. if (actor->IsDetached()) { @@ -934,15 +931,47 @@ void GcsActorManager::RemoveActorFromOwner(const std::shared_ptr &acto } } -void GcsActorManager::KillActor(const std::shared_ptr &actor) { +void GcsActorManager::NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, + bool force_kill, bool no_restart) { auto actor_client = worker_client_factory_(actor->GetAddress()); rpc::KillActorRequest request; request.set_intended_actor_id(actor->GetActorID().Binary()); - request.set_force_kill(true); - request.set_no_restart(true); + request.set_force_kill(force_kill); + request.set_no_restart(no_restart); RAY_UNUSED(actor_client->KillActor(request, nullptr)); } +void GcsActorManager::KillActor(const ActorID &actor_id, bool force_kill, + bool no_restart) { + RAY_LOG(DEBUG) << "Killing actor, job id = " << actor_id.JobId() + << ", actor id = " << actor_id << ", force_kill = " << force_kill; + const auto &it = registered_actors_.find(actor_id); + if (it == registered_actors_.end()) { + RAY_LOG(INFO) << "Tried to kill actor that does not exist " << actor_id; + return; + } + + const auto &actor = it->second; + if (actor->GetState() == rpc::ActorTableData::DEAD || + actor->GetState() == rpc::ActorTableData::DEPENDENCIES_UNREADY) { + return; + } + + // The actor is still alive or pending creation. + const auto &node_id = actor->GetNodeID(); + const auto &worker_id = actor->GetWorkerID(); + auto node_it = created_actors_.find(node_id); + if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { + // The actor has already been created. Destroy the process by force-killing + // it. 
+ NotifyCoreWorkerToKillActor(actor, force_kill, no_restart); + } else { + const auto &task_id = actor->GetCreationTaskSpecification().TaskId(); + CancelActorInScheduling(actor, task_id); + ReconstructActor(actor_id, /*need_reschedule=*/true); + } +} + void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr &actor) { if (destroyed_actors_.size() >= RayConfig::instance().maximum_gcs_destroyed_actor_cached_count()) { @@ -956,6 +985,36 @@ void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr & actor->GetActorID(), (int64_t)actor->GetActorTableData().timestamp()); } +void GcsActorManager::CancelActorInScheduling(const std::shared_ptr &actor, + const TaskID &task_id) { + const auto &actor_id = actor->GetActorID(); + const auto &node_id = actor->GetNodeID(); + // The actor has not been created yet. It is either being scheduled or is + // pending scheduling. + auto canceled_actor_id = + gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); + if (!canceled_actor_id.IsNil()) { + // The actor was being scheduled and has now been canceled. + RAY_CHECK(canceled_actor_id == actor_id); + } else { + auto pending_it = std::find_if(pending_actors_.begin(), pending_actors_.end(), + [actor_id](const std::shared_ptr &actor) { + return actor->GetActorID() == actor_id; + }); + + // The actor was pending scheduling. Remove it from the queue. + if (pending_it != pending_actors_.end()) { + pending_actors_.erase(pending_it); + } else { + // When actor creation request of this actor id is pending in raylet, + // it doesn't responds, and the actor should be still in leasing state. + // NOTE: We will cancel outstanding lease request by calling + // `raylet_client->CancelWorkerLease`. 
+ gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id, task_id); + } + } +} + std::string GcsActorManager::DebugString() const { std::ostringstream stream; stream << "GcsActorManager: {RegisterActor request count: " @@ -964,6 +1023,7 @@ std::string GcsActorManager::DebugString() const { << ", GetActorInfo request count: " << counts_[CountType::GET_ACTOR_INFO_REQUEST] << ", GetNamedActorInfo request count: " << counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST] + << ", KillActor request count: " << counts_[CountType::KILL_ACTOR_REQUEST] << ", Registered actors count: " << registered_actors_.size() << ", Destroyed actors count: " << destroyed_actors_.size() << ", Named actors count: " << named_actors_.size() diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index d3ffc309793e..f2db9345f0ba 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -190,6 +190,10 @@ class GcsActorManager : public rpc::ActorInfoHandler { rpc::GetAllActorInfoReply *reply, rpc::SendReplyCallback send_reply_callback) override; + void HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, + rpc::KillActorViaGcsReply *reply, + rpc::SendReplyCallback send_reply_callback) override; + /// Register actor asynchronously. /// /// \param request Contains the meta info to create the actor. @@ -336,8 +340,18 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// Kill the specified actor. /// + /// \param actor_id ID of the actor to kill. + /// \param force_kill Whether to force kill an actor by killing the worker. + /// \param no_restart If set to true, the killed actor will not be restarted anymore. + void KillActor(const ActorID &actor_id, bool force_kill, bool no_restart); + + /// Notify CoreWorker to kill the specified actor. + /// /// \param actor The actor to be killed. 
- void KillActor(const std::shared_ptr &actor); + /// \param force_kill Whether to force kill an actor by killing the worker. + /// \param no_restart If set to true, the killed actor will not be restarted anymore. + void NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, + bool force_kill = true, bool no_restart = true); /// Add the destroyed actor to the cache. If the cache is full, one actor is randomly /// evicted. @@ -356,6 +370,13 @@ class GcsActorManager : public rpc::ActorInfoHandler { return actor_delta; } + /// Cancel actor which is either being scheduled or is pending scheduling. + /// + /// \param actor The actor to be cancelled. + /// \param task_id The id of actor creation task to be cancelled. + void CancelActorInScheduling(const std::shared_ptr &actor, + const TaskID &task_id); + /// Callbacks of pending `RegisterActor` requests. /// Maps actor ID to actor registration callbacks, which is used to filter duplicated /// messages from a driver/worker caused by some network problems. @@ -413,7 +434,8 @@ class GcsActorManager : public rpc::ActorInfoHandler { GET_ACTOR_INFO_REQUEST = 2, GET_NAMED_ACTOR_INFO_REQUEST = 3, GET_ALL_ACTOR_INFO_REQUEST = 4, - CountType_MAX = 10, + KILL_ACTOR_REQUEST = 5, + CountType_MAX = 6, }; uint64_t counts_[CountType::CountType_MAX] = {0}; }; diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc index 9c81c8c0e98d..1b4201c4f573 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc @@ -127,13 +127,27 @@ std::vector GcsActorScheduler::CancelOnNode(const NodeID &node_id) { return actor_ids; } -void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) { - // NOTE: This method does not currently cancel the outstanding lease request. - // It only removes leasing information from the internal state so that - // RequestWorkerLease ignores the response from raylet. 
+void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, + const TaskID &task_id) { + // NOTE: This method will cancel the outstanding lease request and remove leasing + // information from the internal state. auto node_it = node_to_actors_when_leasing_.find(node_id); - RAY_CHECK(node_it != node_to_actors_when_leasing_.end()); - node_it->second.erase(actor_id); + if (node_it != node_to_actors_when_leasing_.end()) { + node_it->second.erase(actor_id); + } + + const auto &alive_nodes = gcs_node_manager_.GetAllAliveNodes(); + const auto &iter = alive_nodes.find(node_id); + if (iter != alive_nodes.end()) { + const auto &node_info = iter->second; + rpc::Address address; + address.set_raylet_id(node_info->node_id()); + address.set_ip_address(node_info->node_manager_address()); + address.set_port(node_info->node_manager_port()); + auto lease_client = GetOrConnectLeaseClient(address); + lease_client->CancelWorkerLease( + task_id, [](const Status &status, const rpc::CancelWorkerLeaseReply &reply) {}); + } } ActorID GcsActorScheduler::CancelOnWorker(const NodeID &node_id, @@ -238,6 +252,16 @@ void GcsActorScheduler::LeaseWorkerFromNode(std::shared_ptr actor, } if (status.ok()) { + if (reply.worker_address().raylet_id().empty() && + reply.retry_at_raylet_address().raylet_id().empty()) { + // Actor creation task has been cancelled. It is triggered by `ray.kill`. If + // the number of remaining restarts of the actor is not equal to 0, GCS will + // reschedule the actor, so it return directly here. + RAY_LOG(DEBUG) << "Actor " << actor->GetActorID() + << " creation task has been cancelled."; + return; + } + // Remove the actor from the leasing map as the reply is returned from the // remote node. 
iter->second.erase(actor_iter); diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h index 71dd351087e0..c0e3d430ecbf 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h @@ -59,7 +59,8 @@ class GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. - virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) = 0; + virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, + const TaskID &task_id) = 0; /// Cancel the actor that is being scheduled to the specified worker. /// @@ -130,7 +131,8 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. - void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) override; + void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, + const TaskID &task_id) override; /// Cancel the actor that is being scheduled to the specified worker. 
/// diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc index b88c6702bfeb..b8edb6e82164 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc @@ -35,7 +35,8 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { MOCK_METHOD1(CancelOnNode, std::vector(const NodeID &node_id)); MOCK_METHOD2(CancelOnWorker, ActorID(const NodeID &node_id, const WorkerID &worker_id)); - MOCK_METHOD2(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id)); + MOCK_METHOD3(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id, + const TaskID &task_id)); std::vector> actors; }; @@ -735,8 +736,10 @@ TEST_F(GcsActorManagerTest, TestRaceConditionCancelLease) { address.set_raylet_id(node_id.Binary()); address.set_worker_id(worker_id.Binary()); actor->UpdateAddress(address); - const auto actor_id = actor->GetActorID(); - EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id)); + const auto &actor_id = actor->GetActorID(); + const auto &task_id = + TaskID::FromBinary(registered_actor->GetActorTableData().task_spec().task_id()); + EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id, task_id)); gcs_actor_manager_->OnWorkerDead(owner_node_id, owner_worker_id); } diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index d84f99b3fe88..bd98d65ef0f9 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -262,7 +262,8 @@ TEST_F(GcsActorSchedulerTest, TestLeasingCancelledWhenLeasing) { ASSERT_EQ(1, raylet_client_->callbacks.size()); // Cancel the lease request. 
- gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID()); + const auto &task_id = TaskID::FromBinary(create_actor_request.task_spec().task_id()); + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID(), task_id); ASSERT_EQ(1, raylet_client_->num_workers_requested); ASSERT_EQ(1, raylet_client_->callbacks.size()); diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index ed5ca92e2a42..6e2c450dd111 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -92,6 +92,22 @@ message GetAllActorInfoReply { repeated ActorTableData actor_table_data = 2; } +// `KillActorViaGcsRequest` is sent to GCS Service to ask to kill an actor. +// `KillActorViaGcsRequest` is different from `KillActorRequest`. +// `KillActorRequest` is send to core worker to ask to kill an actor. +message KillActorViaGcsRequest { + // ID of this actor. + bytes actor_id = 1; + // Whether to force kill the actor. + bool force_kill = 2; + // If set to true, the killed actor will not be restarted anymore. + bool no_restart = 3; +} + +message KillActorViaGcsReply { + GcsStatus status = 1; +} + // Service for actor info access. service ActorInfoGcsService { // Register actor to gcs service. @@ -104,6 +120,8 @@ service ActorInfoGcsService { rpc GetNamedActorInfo(GetNamedActorInfoRequest) returns (GetNamedActorInfoReply); // Get information of all actor from GCS Service. rpc GetAllActorInfo(GetAllActorInfoRequest) returns (GetAllActorInfoReply); + // Kill actor via GCS Service. 
+ rpc KillActorViaGcs(KillActorViaGcsRequest) returns (KillActorViaGcsReply); } message RegisterNodeRequest { diff --git a/src/ray/rpc/gcs_server/gcs_rpc_client.h b/src/ray/rpc/gcs_server/gcs_rpc_client.h index bf9a72bed7db..bae0e56bd9ae 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_client.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_client.h @@ -144,6 +144,10 @@ class GcsRpcClient { VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, GetAllActorInfo, actor_info_grpc_client_, ) + /// Kill actor via GCS Service. + VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, KillActorViaGcs, + actor_info_grpc_client_, ) + /// Register a node to GCS Service. VOID_GCS_RPC_CLIENT_METHOD(NodeInfoGcsService, RegisterNode, node_info_grpc_client_, ) diff --git a/src/ray/rpc/gcs_server/gcs_rpc_server.h b/src/ray/rpc/gcs_server/gcs_rpc_server.h index 328aa5f7382d..246a5ee9e306 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_server.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_server.h @@ -125,6 +125,10 @@ class ActorInfoGcsServiceHandler { virtual void HandleGetAllActorInfo(const GetAllActorInfoRequest &request, GetAllActorInfoReply *reply, SendReplyCallback send_reply_callback) = 0; + + virtual void HandleKillActorViaGcs(const KillActorViaGcsRequest &request, + KillActorViaGcsReply *reply, + SendReplyCallback send_reply_callback) = 0; }; /// The `GrpcService` for `ActorInfoGcsService`. @@ -148,6 +152,7 @@ class ActorInfoGrpcService : public GrpcService { ACTOR_INFO_SERVICE_RPC_HANDLER(GetActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetNamedActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetAllActorInfo); + ACTOR_INFO_SERVICE_RPC_HANDLER(KillActorViaGcs); } private: From 72f50841b66bca2eb82d746f77dbe0f3955a2d8d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 052/244] Revert "[autoscaler/dashboard] Publish resource usage in units of bytes (#14002)" This reverts commit b5512310eab1eda05d0bc1930f25344c133a43cf. 
--- .../ray/autoscaler/_private/load_metrics.py | 12 +---- python/ray/autoscaler/_private/util.py | 4 +- .../tests/test_resource_demand_scheduler.py | 44 ++++++------------- 3 files changed, 18 insertions(+), 42 deletions(-) diff --git a/python/ray/autoscaler/_private/load_metrics.py b/python/ray/autoscaler/_private/load_metrics.py index 09ea112381ed..bf9dc564bdca 100644 --- a/python/ray/autoscaler/_private/load_metrics.py +++ b/python/ray/autoscaler/_private/load_metrics.py @@ -5,7 +5,6 @@ from typing import Dict, List import numpy as np -import ray.ray_constants import ray._private.services as services from ray.autoscaler._private.constants import MEMORY_RESOURCE_UNIT_BYTES,\ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE @@ -213,15 +212,8 @@ def summary(self): ) if self.static_resources_by_ip else {} usage_dict = {} for key in total_resources: - if key in ["memory", "object_store_memory"]: - total = total_resources[key] * \ - ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES - available = available_resources[key] * \ - ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES - usage_dict[key] = (total - available, total) - else: - total = total_resources[key] - usage_dict[key] = (total - available_resources[key], total) + total = total_resources[key] + usage_dict[key] = (total - available_resources[key], total) summarized_demand_vector = freq_of_dicts( self.get_resource_demand_vector(clip=False)) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 788da5cc2da6..39ebd5e799fe 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -313,12 +313,12 @@ def format_pg(pg): def get_usage_report(lm_summary) -> str: usage_lines = [] - for resource, (used, total) in sorted(lm_summary.usage.items()): + for resource, (used, total) in lm_summary.usage.items(): if "node:" in resource: continue # Skip the auto-added per-node "node:" resource. 
line = f" {used}/{total} {resource}" if resource in ["memory", "object_store_memory"]: - to_GiB = 1 / 2**30 + to_GiB = ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES / 2**30 used *= to_GiB total *= to_GiB line = f" {used:.2f}/{total:.3f} GiB {resource}" diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index d753ffcab35a..977c2f2b8148 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -8,7 +8,6 @@ import copy import ray -import ray.ray_constants from ray.autoscaler._private.util import \ rewrite_legacy_yaml_to_available_node_types, format_info_string, \ format_info_string_no_node_types @@ -1216,27 +1215,15 @@ def testSummary(self): strategy=PlacementStrategy.PACK, bundles=([Bundle(unit_resources={"GPU": 2})] * 2)), ] - lm.update( - "1.1.1.1", - { - "CPU": 64, - "memory": 20, # 1000 MiB - "object_store_memory": 40 # 2000 MiB - }, - { - "CPU": 2, - "memory": 10, # 500 MiB - "object_store_memory": 20 # 1000 MiB - }, - {}) + lm.update("1.1.1.1", {"CPU": 64}, {"CPU": 2}, {}) lm.update("1.1.1.2", { "CPU": 64, "GPU": 8, - "accelerator_type:V100": 1, + "accelerator_type:V100": 1 }, { "CPU": 0, "GPU": 1, - "accelerator_type:V100": 1, + "accelerator_type:V100": 1 }, {}) lm.update("1.1.1.3", { "CPU": 64, @@ -1270,9 +1257,6 @@ def testSummary(self): assert summary.usage["CPU"] == (190, 194) assert summary.usage["GPU"] == (15, 16) - assert summary.usage["memory"] == (500 * 2**20, 1000 * 2**20) - assert summary.usage["object_store_memory"] == \ - (1000 * 2**20, 2000 * 2**20) assert summary.usage["accelerator_type:V100"][1] == 2, \ "Not comparing the usage value due to floating point error." @@ -1296,7 +1280,7 @@ def testSummary(self): # TODO (Alex): This set of nodes won't be very useful in practice # because the node:xxx.xxx.xxx.xxx resources means that no 2 nodes # should ever have the same set of resources. 
- assert len(summary.node_types) == 3, summary.node_types + assert len(summary.node_types) == 3 class AutoscalingTest(unittest.TestCase): @@ -2429,8 +2413,8 @@ def test_info_string(): "CPU": (530, 544), "GPU": (2, 2), "AcceleratorType:V100": (0, 2), - "memory": (2 * 2**30, 2**33), - "object_store_memory": (3.14 * 2**30, 2**34) + "memory": (0, 1583.19), + "object_store_memory": (0, 471.02) }, resource_demand=[({ "CPU": 1 @@ -2473,11 +2457,11 @@ def test_info_string(): -------------------------------------------------------- Usage: - 0/2 AcceleratorType:V100 530/544 CPU 2/2 GPU - 2.00/8.000 GiB memory - 3.14/16.000 GiB object_store_memory + 0/2 AcceleratorType:V100 + 0.00/77.304 GiB memory + 0.00/22.999 GiB object_store_memory Demands: {'CPU': 1}: 150+ pending tasks/actors @@ -2500,8 +2484,8 @@ def test_info_string_no_node_type(): "CPU": (530, 544), "GPU": (2, 2), "AcceleratorType:V100": (0, 2), - "memory": (2 * 2**30, 2**33), - "object_store_memory": (3.14 * 2**30, 2**34) + "memory": (0, 1583.19), + "object_store_memory": (0, 471.02) }, resource_demand=[({ "CPU": 1 @@ -2528,11 +2512,11 @@ def test_info_string_no_node_type(): Resources ----------------------------------------------------- Usage: - 0/2 AcceleratorType:V100 530/544 CPU 2/2 GPU - 2.00/8.000 GiB memory - 3.14/16.000 GiB object_store_memory + 0/2 AcceleratorType:V100 + 0.00/77.304 GiB memory + 0.00/22.999 GiB object_store_memory Demands: {'CPU': 1}: 150+ pending tasks/actors From 8a435b50d8589a887863be1f4178da98574a6c73 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 053/244] Revert "[docs] optuna variable typo (#14006)" This reverts commit 2581a455592f3ee4c39fe723af587cd87bc4677d. 
--- python/ray/tune/suggest/optuna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/suggest/optuna.py b/python/ray/tune/suggest/optuna.py index a966892d0ef5..61dd13d62646 100644 --- a/python/ray/tune/suggest/optuna.py +++ b/python/ray/tune/suggest/optuna.py @@ -98,7 +98,7 @@ class OptunaSearch(Searcher): param.suggest_uniform("b", 10, 20) ] - optuna_search = OptunaSearch( + algo = OptunaSearch( space, metric="loss", mode="min") From 222cfd429a28015b12d5107178cc3f4aff25ac87 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 054/244] Revert "[tune] pass trainable function name when using `tune.with_parameters` (#14009)" This reverts commit 7bf3d0b593544b28eabe42c3ba10aa94df48cf5d. --- python/ray/tune/function_runner.py | 6 ------ python/ray/tune/tests/test_function_api.py | 2 -- 2 files changed, 8 deletions(-) diff --git a/python/ray/tune/function_runner.py b/python/ray/tune/function_runner.py index c7c088293757..9da6b260130a 100644 --- a/python/ray/tune/function_runner.py +++ b/python/ray/tune/function_runner.py @@ -644,22 +644,16 @@ def inner(config, checkpoint_dir=None): fn_kwargs[k] = parameter_registry.get(prefix + k) fn(config, **fn_kwargs) - fn_name = getattr(fn, "__name__", "tune_with_parameters") - inner.__name__ = fn_name - # Use correct function signature if no `checkpoint_dir` parameter is set if not use_checkpoint: def _inner(config): inner(config, checkpoint_dir=None) - _inner.__name__ = fn_name - if hasattr(fn, "__mixins__"): _inner.__mixins__ = fn.__mixins__ return _inner if hasattr(fn, "__mixins__"): inner.__mixins__ = fn.__mixins__ - return inner diff --git a/python/ray/tune/tests/test_function_api.py b/python/ray/tune/tests/test_function_api.py index f7084a1fac2c..9ee2cdc64777 100644 --- a/python/ray/tune/tests/test_function_api.py +++ b/python/ray/tune/tests/test_function_api.py @@ -455,7 +455,6 @@ def train(config, 
data=None): self.assertEquals(trial_1.last_result["hundred"], 1) self.assertEquals(trial_2.last_result["metric"], 500_000) self.assertEquals(trial_2.last_result["hundred"], 1) - self.assertTrue(str(trial_1).startswith("train_")) # With checkpoint dir parameter def train(config, checkpoint_dir="DIR", data=None): @@ -470,7 +469,6 @@ def train(config, checkpoint_dir="DIR", data=None): self.assertEquals(trial_1.last_result["cp"], "DIR") self.assertEquals(trial_2.last_result["metric"], 500_000) self.assertEquals(trial_2.last_result["cp"], "DIR") - self.assertTrue(str(trial_1).startswith("train_")) def testWithParameters2(self): class Data: From 90de4e1d1944acd10f25cc0773f985a526741a9e Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 055/244] Revert "[RLlib]: Trajectory View API: Keep env infos (e.g. for postprocessing callbacks), no matter what. (#13555)" This reverts commit 517472b5b149403291e1bd306b0e408635c7add2. --- rllib/policy/dynamic_tf_policy.py | 4 ++-- rllib/policy/policy.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index a5b01db875c8..10ecf99311e6 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -590,12 +590,12 @@ def fake_array(tensor): del self._loss_input_dict[key] # Remove those not needed at all (leave those that are needed # by Sampler to properly execute sample collection). - # Also always leave DONES, REWARDS, and INFOS, no matter what. + # Also always leave DONES and REWARDS, no matter what. 
for key in list(self.view_requirements.keys()): if key not in all_accessed_keys and key not in [ SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS] and \ + SampleBatch.REWARDS] and \ key not in self.model.view_requirements: # If user deleted this key manually in postprocessing # fn, warn about it and do not remove from diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index d208c7d1537d..1bce4b96d97e 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -676,12 +676,12 @@ def _initialize_loss_from_dummy_batch( self.view_requirements[key].used_for_training = False # Remove those not needed at all (leave those that are needed # by Sampler to properly execute sample collection). - # Also always leave DONES, REWARDS, INFOS, no matter what. + # Also always leave DONES and REWARDS, no matter what. for key in list(self.view_requirements.keys()): if key not in all_accessed_keys and key not in [ SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS] and \ + SampleBatch.REWARDS] and \ key not in self.model.view_requirements: # If user deleted this key manually in postprocessing # fn, warn about it and do not remove from From 77bc2813a5bf2fce2b17c014f079a69c2ba8370a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 056/244] Revert "[Core]Fix ray.kill doesn't cancel pending actor bug (#13254)" This reverts commit 50319b205fe44b9b7bfbf73219b276e713c8ecf8. 
--- .../main/java/io/ray/test/KillActorTest.java | 2 - python/ray/tests/test_actor_advanced.py | 84 ------------ python/ray/tests/test_placement_group.py | 12 +- python/ray/tests/test_queue.py | 6 +- python/ray/tests/test_reference_counting.py | 4 +- src/ray/core_worker/core_worker.cc | 4 +- src/ray/core_worker/core_worker.h | 1 - src/ray/gcs/accessor.h | 10 -- .../gcs/gcs_client/service_based_accessor.cc | 20 --- .../gcs/gcs_client/service_based_accessor.h | 3 - src/ray/gcs/gcs_server/gcs_actor_manager.cc | 126 +++++------------- src/ray/gcs/gcs_server/gcs_actor_manager.h | 26 +--- src/ray/gcs/gcs_server/gcs_actor_scheduler.cc | 36 +---- src/ray/gcs/gcs_server/gcs_actor_scheduler.h | 6 +- .../gcs_server/test/gcs_actor_manager_test.cc | 9 +- .../test/gcs_actor_scheduler_test.cc | 3 +- src/ray/protobuf/gcs_service.proto | 18 --- src/ray/rpc/gcs_server/gcs_rpc_client.h | 4 - src/ray/rpc/gcs_server/gcs_rpc_server.h | 5 - 19 files changed, 54 insertions(+), 325 deletions(-) diff --git a/java/test/src/main/java/io/ray/test/KillActorTest.java b/java/test/src/main/java/io/ray/test/KillActorTest.java index 753b00a9c59c..fd92b97118ef 100644 --- a/java/test/src/main/java/io/ray/test/KillActorTest.java +++ b/java/test/src/main/java/io/ray/test/KillActorTest.java @@ -59,8 +59,6 @@ private static void remoteKill(ActorHandle actor, boolean noRestart) { private void testKillActor(BiConsumer, Boolean> kill, boolean noRestart) { ActorHandle actor = Ray.actor(HangActor::new).setMaxRestarts(1).remote(); - // Wait for the actor to be created. - actor.task(HangActor::ping).remote().get(); ObjectRef result = actor.task(HangActor::hang).remote(); // The actor will hang in this task. 
Assert.assertEquals(0, Ray.wait(ImmutableList.of(result), 1, 500).getReady().size()); diff --git a/python/ray/tests/test_actor_advanced.py b/python/ray/tests/test_actor_advanced.py index 496e977fe9cd..1913decf83df 100644 --- a/python/ray/tests/test_actor_advanced.py +++ b/python/ray/tests/test_actor_advanced.py @@ -1093,90 +1093,6 @@ class Actor2: global_state_accessor.disconnect() -def test_kill_pending_actor_with_no_restart_true(): - cluster = ray.init() - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - @ray.remote(resources={"WORKER": 1.0}) - class PendingActor: - pass - - # Kill actor with `no_restart=True`. - actor = PendingActor.remote() - # TODO(ffbin): The raylet doesn't guarantee the order when dealing with - # RequestWorkerLease and CancelWorkerLease. If we kill the actor - # immediately after creating the actor, we may not be able to clean up - # the request cached by the raylet. - # See https://github.com/ray-project/ray/issues/13545 for details. - time.sleep(1) - ray.kill(actor, no_restart=True) - - def condition1(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return True - return False - - # Actor is dead, so the infeasible task queue length is 0. - wait_for_condition(condition1, timeout=10) - - global_state_accessor.disconnect() - ray.shutdown() - - -def test_kill_pending_actor_with_no_restart_false(): - cluster = ray.init() - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - @ray.remote(resources={"WORKER": 1.0}, max_restarts=1) - class PendingActor: - pass - - # Kill actor with `no_restart=False`. 
- actor = PendingActor.remote() - # TODO(ffbin): The raylet doesn't guarantee the order when dealing with - # RequestWorkerLease and CancelWorkerLease. If we kill the actor - # immediately after creating the actor, we may not be able to clean up - # the request cached by the raylet. - # See https://github.com/ray-project/ray/issues/13545 for details. - time.sleep(1) - ray.kill(actor, no_restart=False) - - def condition1(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return False - return True - - # Actor restarts, so the infeasible task queue length is 1. - wait_for_condition(condition1, timeout=10) - - # Kill actor again and actor is dead, - # so the infeasible task queue length is 0. - ray.kill(actor, no_restart=False) - - def condition2(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return True - return False - - wait_for_condition(condition2, timeout=10) - - global_state_accessor.disconnect() - ray.shutdown() - - if __name__ == "__main__": import pytest # Test suite is timing out. Disable on windows for now. diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 92ef90ca4e1e..024ff6c5557a 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -902,10 +902,8 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. 
ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # Now create an actor, but do not capture the current tasks a = Actor.options( @@ -927,10 +925,8 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # Lastly, make sure when None is specified, actors are not scheduled # on the same placement group. @@ -1420,10 +1416,8 @@ def schedule_nested_actor_with_detached_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # We should have 2 alive pgs and 4 alive actors. assert assert_alive_num_pg(2) diff --git a/python/ray/tests/test_queue.py b/python/ray/tests/test_queue.py index 88cf6d7b647f..6c2fb5cf0ec9 100644 --- a/python/ray/tests/test_queue.py +++ b/python/ray/tests/test_queue.py @@ -199,19 +199,17 @@ def test_custom_resources(ray_start_regular_shared): assert current_resources["CPU"] == 1.0 # By default an actor should not reserve any resources. - q = Queue() + Queue() current_resources = ray.available_resources() assert current_resources["CPU"] == 1.0 - q.shutdown() # Specify resource requirement. The queue should now reserve 1 CPU. 
- q = Queue(actor_options={"num_cpus": 1}) + Queue(actor_options={"num_cpus": 1}) def no_cpu_in_resources(): return "CPU" not in ray.available_resources() wait_for_condition(no_cpu_in_resources) - q.shutdown() if __name__ == "__main__": diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 9fcd3c25f4c4..02638ed3dea8 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -470,10 +470,8 @@ def delete_ref2(self): # Test that the actor exiting stops the reference from being pinned. ray.kill(actor) # Wait for the actor to exit. - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(actor.delete_ref1.remote()) - except ray.exceptions.RayActorError: - pass else: # Test that deleting the second reference stops it from being pinned. ray.get(actor.delete_ref2.remote()) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index f7c663b5043b..6c8287c1507b 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1629,9 +1629,7 @@ Status CoreWorker::KillActor(const ActorID &actor_id, bool force_kill, bool no_r stream << "Failed to find a corresponding actor handle for " << actor_id; return Status::Invalid(stream.str()); } - - RAY_CHECK_OK( - gcs_client_->Actors().AsyncKillActor(actor_id, force_kill, no_restart, nullptr)); + direct_actor_submitter_->KillActor(actor_id, force_kill, no_restart); return Status::OK(); } diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 83242c00059b..6fa24c29e94e 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -728,7 +728,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Tell an actor to exit immediately, without completing outstanding work. /// /// \param[in] actor_id ID of the actor to kill. 
- /// \param[in] force_kill Whether to force kill an actor by killing the worker. /// \param[in] no_restart If set to true, the killed actor will not be /// restarted anymore. /// \param[out] Status diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index db240b411cdf..be929ec3ff0d 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -64,16 +64,6 @@ class ActorInfoAccessor { virtual Status AsyncRegisterActor(const TaskSpecification &task_spec, const StatusCallback &callback) = 0; - /// Kill actor via GCS asynchronously. - /// - /// \param actor_id The ID of actor to destroy. - /// \param force_kill Whether to force kill an actor by killing the worker. - /// \param no_restart If set to true, the killed actor will not be restarted anymore. - /// \param callback Callback that will be called after the actor is destroyed. - /// \return Status - virtual Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, - const StatusCallback &callback) = 0; - /// Asynchronously request GCS to create the actor. /// /// This should be called after the worker has resolved the actor dependencies. 
diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index 5905966cb92a..a82e0ab6bcdd 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -200,26 +200,6 @@ Status ServiceBasedActorInfoAccessor::AsyncRegisterActor( return Status::OK(); } -Status ServiceBasedActorInfoAccessor::AsyncKillActor( - const ActorID &actor_id, bool force_kill, bool no_restart, - const ray::gcs::StatusCallback &callback) { - rpc::KillActorViaGcsRequest request; - request.set_actor_id(actor_id.Binary()); - request.set_force_kill(force_kill); - request.set_no_restart(no_restart); - client_impl_->GetGcsRpcClient().KillActorViaGcs( - request, [callback](const Status &, const rpc::KillActorViaGcsReply &reply) { - if (callback) { - auto status = - reply.status().code() == (int)StatusCode::OK - ? Status() - : Status(StatusCode(reply.status().code()), reply.status().message()); - callback(status); - } - }); - return Status::OK(); -} - Status ServiceBasedActorInfoAccessor::AsyncCreateActor( const ray::TaskSpecification &task_spec, const ray::gcs::StatusCallback &callback) { RAY_CHECK(task_spec.IsActorCreationTask() && callback); diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index 8aab5198f28e..c883e7b626a7 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -85,9 +85,6 @@ class ServiceBasedActorInfoAccessor : public ActorInfoAccessor { Status AsyncCreateActor(const TaskSpecification &task_spec, const StatusCallback &callback) override; - Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, - const StatusCallback &callback) override; - Status AsyncSubscribeAll( const SubscribeCallback &subscribe, const StatusCallback &done) override; diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc 
b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 338fc149c327..2f3740654c8b 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -214,25 +214,6 @@ void GcsActorManager::HandleGetNamedActorInfo( ++counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST]; } -void GcsActorManager::HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, - rpc::KillActorViaGcsReply *reply, - rpc::SendReplyCallback send_reply_callback) { - const auto &actor_id = ActorID::FromBinary(request.actor_id()); - bool force_kill = request.force_kill(); - bool no_restart = request.no_restart(); - if (no_restart) { - DestroyActor(actor_id); - } else { - KillActor(actor_id, force_kill, no_restart); - } - - GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); - RAY_LOG(DEBUG) << "Finished killing actor, job id = " << actor_id.JobId() - << ", actor id = " << actor_id << ", force_kill = " << force_kill - << ", no_restart = " << no_restart; - ++counts_[CountType::KILL_ACTOR_REQUEST]; -} - Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &request, RegisterActorCallback success_callback) { // NOTE: After the abnormal recovery of the network between GCS client and GCS server or @@ -436,11 +417,8 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { actor_to_register_callbacks_.erase(actor_id); actor_to_create_callbacks_.erase(actor_id); auto it = registered_actors_.find(actor_id); - if (it == registered_actors_.end()) { - RAY_LOG(INFO) << "Tried to destroy actor that does not exist " << actor_id; - return; - } - const auto &task_id = it->second->GetCreationTaskSpecification().TaskId(); + RAY_CHECK(it != registered_actors_.end()) + << "Tried to destroy actor that does not exist " << actor_id; it->second->GetMutableActorTableData()->mutable_task_spec()->Clear(); it->second->GetMutableActorTableData()->set_timestamp(current_sys_time_ms()); AddDestroyedActorToCache(it->second); @@ -478,13 +456,38 @@ void 
GcsActorManager::DestroyActor(const ActorID &actor_id) { if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { // The actor has already been created. Destroy the process by force-killing // it. - NotifyCoreWorkerToKillActor(actor); + KillActor(actor); RAY_CHECK(node_it->second.erase(actor->GetWorkerID())); if (node_it->second.empty()) { created_actors_.erase(node_it); } } else { - CancelActorInScheduling(actor, task_id); + // The actor has not been created yet. It is either being scheduled or is + // pending scheduling. + auto canceled_actor_id = + gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); + if (!canceled_actor_id.IsNil()) { + // The actor was being scheduled and has now been canceled. + RAY_CHECK(canceled_actor_id == actor_id); + } else { + auto pending_it = + std::find_if(pending_actors_.begin(), pending_actors_.end(), + [actor_id](const std::shared_ptr &actor) { + return actor->GetActorID() == actor_id; + }); + + // The actor was pending scheduling. Remove it from the queue. + if (pending_it != pending_actors_.end()) { + pending_actors_.erase(pending_it); + } else { + // When actor creation request of this actor id is pending in raylet, + // it doesn't responds, and the actor should be still in leasing state. + // NOTE: Raylet will cancel the lease request once it receives the + // actor state notification. So this method doesn't have to cancel + // outstanding lease request by calling raylet_client->CancelWorkerLease + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id); + } + } } } @@ -703,7 +706,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, [this, actor, actor_id, mutable_actor_table_data](Status status) { - // If actor was an detached actor, make sure to destroy it. + // if actor was an detached actor, make sure to destroy it. 
// We need to do this because detached actors are not destroyed // when its owners are dead because it doesn't have owners. if (actor->IsDetached()) { @@ -931,47 +934,15 @@ void GcsActorManager::RemoveActorFromOwner(const std::shared_ptr &acto } } -void GcsActorManager::NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, - bool force_kill, bool no_restart) { +void GcsActorManager::KillActor(const std::shared_ptr &actor) { auto actor_client = worker_client_factory_(actor->GetAddress()); rpc::KillActorRequest request; request.set_intended_actor_id(actor->GetActorID().Binary()); - request.set_force_kill(force_kill); - request.set_no_restart(no_restart); + request.set_force_kill(true); + request.set_no_restart(true); RAY_UNUSED(actor_client->KillActor(request, nullptr)); } -void GcsActorManager::KillActor(const ActorID &actor_id, bool force_kill, - bool no_restart) { - RAY_LOG(DEBUG) << "Killing actor, job id = " << actor_id.JobId() - << ", actor id = " << actor_id << ", force_kill = " << force_kill; - const auto &it = registered_actors_.find(actor_id); - if (it == registered_actors_.end()) { - RAY_LOG(INFO) << "Tried to kill actor that does not exist " << actor_id; - return; - } - - const auto &actor = it->second; - if (actor->GetState() == rpc::ActorTableData::DEAD || - actor->GetState() == rpc::ActorTableData::DEPENDENCIES_UNREADY) { - return; - } - - // The actor is still alive or pending creation. - const auto &node_id = actor->GetNodeID(); - const auto &worker_id = actor->GetWorkerID(); - auto node_it = created_actors_.find(node_id); - if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { - // The actor has already been created. Destroy the process by force-killing - // it. 
- NotifyCoreWorkerToKillActor(actor, force_kill, no_restart); - } else { - const auto &task_id = actor->GetCreationTaskSpecification().TaskId(); - CancelActorInScheduling(actor, task_id); - ReconstructActor(actor_id, /*need_reschedule=*/true); - } -} - void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr &actor) { if (destroyed_actors_.size() >= RayConfig::instance().maximum_gcs_destroyed_actor_cached_count()) { @@ -985,36 +956,6 @@ void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr & actor->GetActorID(), (int64_t)actor->GetActorTableData().timestamp()); } -void GcsActorManager::CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id) { - const auto &actor_id = actor->GetActorID(); - const auto &node_id = actor->GetNodeID(); - // The actor has not been created yet. It is either being scheduled or is - // pending scheduling. - auto canceled_actor_id = - gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); - if (!canceled_actor_id.IsNil()) { - // The actor was being scheduled and has now been canceled. - RAY_CHECK(canceled_actor_id == actor_id); - } else { - auto pending_it = std::find_if(pending_actors_.begin(), pending_actors_.end(), - [actor_id](const std::shared_ptr &actor) { - return actor->GetActorID() == actor_id; - }); - - // The actor was pending scheduling. Remove it from the queue. - if (pending_it != pending_actors_.end()) { - pending_actors_.erase(pending_it); - } else { - // When actor creation request of this actor id is pending in raylet, - // it doesn't responds, and the actor should be still in leasing state. - // NOTE: We will cancel outstanding lease request by calling - // `raylet_client->CancelWorkerLease`. 
- gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id, task_id); - } - } -} - std::string GcsActorManager::DebugString() const { std::ostringstream stream; stream << "GcsActorManager: {RegisterActor request count: " @@ -1023,7 +964,6 @@ std::string GcsActorManager::DebugString() const { << ", GetActorInfo request count: " << counts_[CountType::GET_ACTOR_INFO_REQUEST] << ", GetNamedActorInfo request count: " << counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST] - << ", KillActor request count: " << counts_[CountType::KILL_ACTOR_REQUEST] << ", Registered actors count: " << registered_actors_.size() << ", Destroyed actors count: " << destroyed_actors_.size() << ", Named actors count: " << named_actors_.size() diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index f2db9345f0ba..d3ffc309793e 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -190,10 +190,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { rpc::GetAllActorInfoReply *reply, rpc::SendReplyCallback send_reply_callback) override; - void HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, - rpc::KillActorViaGcsReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - /// Register actor asynchronously. /// /// \param request Contains the meta info to create the actor. @@ -340,18 +336,8 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// Kill the specified actor. /// - /// \param actor_id ID of the actor to kill. - /// \param force_kill Whether to force kill an actor by killing the worker. - /// \param no_restart If set to true, the killed actor will not be restarted anymore. - void KillActor(const ActorID &actor_id, bool force_kill, bool no_restart); - - /// Notify CoreWorker to kill the specified actor. - /// /// \param actor The actor to be killed. - /// \param force_kill Whether to force kill an actor by killing the worker. 
- /// \param no_restart If set to true, the killed actor will not be restarted anymore. - void NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, - bool force_kill = true, bool no_restart = true); + void KillActor(const std::shared_ptr &actor); /// Add the destroyed actor to the cache. If the cache is full, one actor is randomly /// evicted. @@ -370,13 +356,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { return actor_delta; } - /// Cancel actor which is either being scheduled or is pending scheduling. - /// - /// \param actor The actor to be cancelled. - /// \param task_id The id of actor creation task to be cancelled. - void CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id); - /// Callbacks of pending `RegisterActor` requests. /// Maps actor ID to actor registration callbacks, which is used to filter duplicated /// messages from a driver/worker caused by some network problems. @@ -434,8 +413,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { GET_ACTOR_INFO_REQUEST = 2, GET_NAMED_ACTOR_INFO_REQUEST = 3, GET_ALL_ACTOR_INFO_REQUEST = 4, - KILL_ACTOR_REQUEST = 5, - CountType_MAX = 6, + CountType_MAX = 10, }; uint64_t counts_[CountType::CountType_MAX] = {0}; }; diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc index 1b4201c4f573..9c81c8c0e98d 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc @@ -127,27 +127,13 @@ std::vector GcsActorScheduler::CancelOnNode(const NodeID &node_id) { return actor_ids; } -void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) { - // NOTE: This method will cancel the outstanding lease request and remove leasing - // information from the internal state. 
+void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) { + // NOTE: This method does not currently cancel the outstanding lease request. + // It only removes leasing information from the internal state so that + // RequestWorkerLease ignores the response from raylet. auto node_it = node_to_actors_when_leasing_.find(node_id); - if (node_it != node_to_actors_when_leasing_.end()) { - node_it->second.erase(actor_id); - } - - const auto &alive_nodes = gcs_node_manager_.GetAllAliveNodes(); - const auto &iter = alive_nodes.find(node_id); - if (iter != alive_nodes.end()) { - const auto &node_info = iter->second; - rpc::Address address; - address.set_raylet_id(node_info->node_id()); - address.set_ip_address(node_info->node_manager_address()); - address.set_port(node_info->node_manager_port()); - auto lease_client = GetOrConnectLeaseClient(address); - lease_client->CancelWorkerLease( - task_id, [](const Status &status, const rpc::CancelWorkerLeaseReply &reply) {}); - } + RAY_CHECK(node_it != node_to_actors_when_leasing_.end()); + node_it->second.erase(actor_id); } ActorID GcsActorScheduler::CancelOnWorker(const NodeID &node_id, @@ -252,16 +238,6 @@ void GcsActorScheduler::LeaseWorkerFromNode(std::shared_ptr actor, } if (status.ok()) { - if (reply.worker_address().raylet_id().empty() && - reply.retry_at_raylet_address().raylet_id().empty()) { - // Actor creation task has been cancelled. It is triggered by `ray.kill`. If - // the number of remaining restarts of the actor is not equal to 0, GCS will - // reschedule the actor, so it return directly here. - RAY_LOG(DEBUG) << "Actor " << actor->GetActorID() - << " creation task has been cancelled."; - return; - } - // Remove the actor from the leasing map as the reply is returned from the // remote node. 
iter->second.erase(actor_iter); diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h index c0e3d430ecbf..71dd351087e0 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h @@ -59,8 +59,7 @@ class GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. - virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) = 0; + virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) = 0; /// Cancel the actor that is being scheduled to the specified worker. /// @@ -131,8 +130,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. - void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) override; + void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) override; /// Cancel the actor that is being scheduled to the specified worker. 
/// diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc index b8edb6e82164..b88c6702bfeb 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc @@ -35,8 +35,7 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { MOCK_METHOD1(CancelOnNode, std::vector(const NodeID &node_id)); MOCK_METHOD2(CancelOnWorker, ActorID(const NodeID &node_id, const WorkerID &worker_id)); - MOCK_METHOD3(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id)); + MOCK_METHOD2(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id)); std::vector> actors; }; @@ -736,10 +735,8 @@ TEST_F(GcsActorManagerTest, TestRaceConditionCancelLease) { address.set_raylet_id(node_id.Binary()); address.set_worker_id(worker_id.Binary()); actor->UpdateAddress(address); - const auto &actor_id = actor->GetActorID(); - const auto &task_id = - TaskID::FromBinary(registered_actor->GetActorTableData().task_spec().task_id()); - EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id, task_id)); + const auto actor_id = actor->GetActorID(); + EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id)); gcs_actor_manager_->OnWorkerDead(owner_node_id, owner_worker_id); } diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index bd98d65ef0f9..d84f99b3fe88 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -262,8 +262,7 @@ TEST_F(GcsActorSchedulerTest, TestLeasingCancelledWhenLeasing) { ASSERT_EQ(1, raylet_client_->callbacks.size()); // Cancel the lease request. 
- const auto &task_id = TaskID::FromBinary(create_actor_request.task_spec().task_id()); - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID(), task_id); + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID()); ASSERT_EQ(1, raylet_client_->num_workers_requested); ASSERT_EQ(1, raylet_client_->callbacks.size()); diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 6e2c450dd111..ed5ca92e2a42 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -92,22 +92,6 @@ message GetAllActorInfoReply { repeated ActorTableData actor_table_data = 2; } -// `KillActorViaGcsRequest` is sent to GCS Service to ask to kill an actor. -// `KillActorViaGcsRequest` is different from `KillActorRequest`. -// `KillActorRequest` is send to core worker to ask to kill an actor. -message KillActorViaGcsRequest { - // ID of this actor. - bytes actor_id = 1; - // Whether to force kill the actor. - bool force_kill = 2; - // If set to true, the killed actor will not be restarted anymore. - bool no_restart = 3; -} - -message KillActorViaGcsReply { - GcsStatus status = 1; -} - // Service for actor info access. service ActorInfoGcsService { // Register actor to gcs service. @@ -120,8 +104,6 @@ service ActorInfoGcsService { rpc GetNamedActorInfo(GetNamedActorInfoRequest) returns (GetNamedActorInfoReply); // Get information of all actor from GCS Service. rpc GetAllActorInfo(GetAllActorInfoRequest) returns (GetAllActorInfoReply); - // Kill actor via GCS Service. 
- rpc KillActorViaGcs(KillActorViaGcsRequest) returns (KillActorViaGcsReply); } message RegisterNodeRequest { diff --git a/src/ray/rpc/gcs_server/gcs_rpc_client.h b/src/ray/rpc/gcs_server/gcs_rpc_client.h index bae0e56bd9ae..bf9a72bed7db 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_client.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_client.h @@ -144,10 +144,6 @@ class GcsRpcClient { VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, GetAllActorInfo, actor_info_grpc_client_, ) - /// Kill actor via GCS Service. - VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, KillActorViaGcs, - actor_info_grpc_client_, ) - /// Register a node to GCS Service. VOID_GCS_RPC_CLIENT_METHOD(NodeInfoGcsService, RegisterNode, node_info_grpc_client_, ) diff --git a/src/ray/rpc/gcs_server/gcs_rpc_server.h b/src/ray/rpc/gcs_server/gcs_rpc_server.h index 246a5ee9e306..328aa5f7382d 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_server.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_server.h @@ -125,10 +125,6 @@ class ActorInfoGcsServiceHandler { virtual void HandleGetAllActorInfo(const GetAllActorInfoRequest &request, GetAllActorInfoReply *reply, SendReplyCallback send_reply_callback) = 0; - - virtual void HandleKillActorViaGcs(const KillActorViaGcsRequest &request, - KillActorViaGcsReply *reply, - SendReplyCallback send_reply_callback) = 0; }; /// The `GrpcService` for `ActorInfoGcsService`. @@ -152,7 +148,6 @@ class ActorInfoGrpcService : public GrpcService { ACTOR_INFO_SERVICE_RPC_HANDLER(GetActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetNamedActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetAllActorInfo); - ACTOR_INFO_SERVICE_RPC_HANDLER(KillActorViaGcs); } private: From 61a57356f9f68a3630f9adc3c39e0d7fe58ef4f9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 057/244] Revert "Skip placement tests on Windows (#14000)" This reverts commit 39b63a9c67d46992635c80b70bc96c031d3d0f59. 
--- ci/travis/ci.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 9324853fee34..61b74b082798 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -165,7 +165,6 @@ test_python() { -python/ray/tests:test_multiprocessing # test_connect_to_ray() fails to connect to raylet -python/ray/tests:test_node_manager -python/ray/tests:test_object_manager - -python/ray/tests:test_placement_group # timeout and OOM -python/ray/tests:test_ray_init # test_redis_port() seems to fail here, but pass in isolation -python/ray/tests:test_resource_demand_scheduler -python/ray/tests:test_stress # timeout From 6834c973cf6baa8e4badc5552b3cf434b7d85fb7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 058/244] Revert "[autoscaler][kubernetes] Ray client setup, example config simplification, example scripts. (#13920)" This reverts commit a4226a4a5111814eb897b7e03e6b7880cd1c9a34. --- python/ray/autoscaler/_private/commands.py | 2 +- .../ray/autoscaler/kubernetes/defaults.yaml | 237 ++++++++++------ .../kubernetes/example-full-legacy.yaml | 261 ------------------ .../autoscaler/kubernetes/example-full.yaml | 255 +++++++++++------ .../kubernetes/example-minimal.yaml | 29 +- .../kubernetes/example_scripts/job_example.py | 71 ----- .../example_scripts/run_local_example.py | 58 ---- .../kubernetes/example_scripts/run_on_head.py | 50 ---- .../autoscaler/kubernetes/job-example.yaml | 24 -- .../operator_configs/cluster_crd.yaml | 6 +- .../operator_configs/example_cluster.yaml | 27 +- .../operator_configs/example_cluster2.yaml | 27 +- .../kubernetes/operator_configs/operator.yaml | 2 +- python/ray/autoscaler/ray-schema.json | 8 +- python/ray/ray_operator/operator_utils.py | 57 +--- python/ray/tests/test_autoscaler_yaml.py | 9 +- python/ray/tests/test_k8s_cluster_launcher.py | 4 +- .../ray/tests/test_k8s_operator_examples.py | 53 +--- 18 files changed, 392 insertions(+), 788 
deletions(-) delete mode 100644 python/ray/autoscaler/kubernetes/example-full-legacy.yaml delete mode 100644 python/ray/autoscaler/kubernetes/example_scripts/job_example.py delete mode 100644 python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py delete mode 100644 python/ray/autoscaler/kubernetes/example_scripts/run_on_head.py delete mode 100644 python/ray/autoscaler/kubernetes/job-example.yaml diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index 336dca40ffd2..84d3b15694ad 100644 --- a/python/ray/autoscaler/_private/commands.py +++ b/python/ray/autoscaler/_private/commands.py @@ -149,7 +149,7 @@ def create_or_update_cluster( redirect_command_output: Optional[bool] = False, use_login_shells: bool = True, no_monitor_on_head: bool = False) -> Dict[str, Any]: - """Creates or updates an autoscaling Ray cluster from a config json.""" + """Create or updates an autoscaling Ray cluster from a config json.""" # no_monitor_on_head is an internal flag used by the Ray K8s operator. # If True, prevents autoscaling config sync to the Ray head during cluster # creation. See https://github.com/ray-project/ray/pull/13720. diff --git a/python/ray/autoscaler/kubernetes/defaults.yaml b/python/ray/autoscaler/kubernetes/defaults.yaml index 4d6d481927f9..31b3301ea0f6 100644 --- a/python/ray/autoscaler/kubernetes/defaults.yaml +++ b/python/ray/autoscaler/kubernetes/defaults.yaml @@ -1,8 +1,12 @@ -# A unique identifier for the head node and workers of this cluster. -cluster_name: defaults +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 # The maximum number of workers nodes to launch in addition to the head -# node. +# node. This takes precedence over min_workers. 
max_workers: 2 # The autoscaler will scale up the cluster faster with higher upscaling speed. @@ -74,83 +78,127 @@ provider: # NOTE: If you're running multiple Ray clusters with services # on one Kubernetes cluster, they must have unique service # names. - name: example-cluster-ray-head + name: ray-head spec: # This selector must match the head node pod's selector below. selector: - component: example-cluster-ray-head + component: ray-head + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + + # Service that maps to the worker nodes of the Ray cluster. + - apiVersion: v1 + kind: Service + metadata: + # NOTE: If you're running multiple Ray clusters with services + # on one Kubernetes cluster, they must have unique service + # names. + name: ray-workers + spec: + # This selector must match the worker node pods' selector below. + selector: + component: ray-worker ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - -# Specify the pod type for the ray head node (as configured below). -head_node_type: head_node -# Specify the allowed pod types for this ray cluster and the resources they provide. -available_node_types: - worker_node: - # Minimum number of Ray workers of this Pod type. - min_workers: 0 - # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers. - max_workers: 2 - node_config: - apiVersion: v1 - kind: Pod - metadata: + - protocol: TCP + port: 8000 + targetPort: 8000 + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-worker- - spec: + generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. 
+ labels: + component: ray-head + spec: + # Change this if you altered the autoscaler_service_account above + # or want to provide your own. + serviceAccountName: autoscaler + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) image: rayproject/ray:nightly + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. 
If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi - head_node: - node_config: - apiVersion: v1 - kind: Pod - metadata: + requests: + cpu: 1000m + memory: 512Mi + limits: + # The maximum memory that this pod is allowed to use. The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - # Must match the head node service selector above if a head node + generateName: ray-worker- + + # Must match the worker node service selector above if a worker node # service is required. labels: - component: example-cluster-ray-head - spec: - # Change this if you altered the autoscaler_service_account above - # or want to provide your own. - serviceAccountName: autoscaler + component: ray-worker + spec: + serviceAccountName: default + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. 
restartPolicy: Never # This volume allocates shared memory for Ray to use for its plasma @@ -159,51 +207,45 @@ available_node_types: volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) image: rayproject/ray:nightly # Do not change this command - it keeps the pod alive until it is # explicitly killed. command: ["/bin/bash", "-c", "--"] - args: ['trap : TERM INT; sleep infinity & wait;'] + args: ["trap : TERM INT; sleep infinity & wait;"] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi - - -# Command to start ray on the head node. You don't need to change this. -# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. 
-head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 - -# Command to start ray on worker nodes. You don't need to change this. -worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + requests: + cpu: 1000m + memory: 512Mi + limits: + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. @@ -224,6 +266,16 @@ cluster_synced_files: [] # should sync to the worker node continuously file_mounts_sync_continuously: False +# Patterns for files to exclude when running rsync up or rsync down. +# This is not supported on kubernetes. +# rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +# This is not supported on kubernetes. +# rsync_filter: [] + # List of commands that will be run before `setup_commands`. If docker is # enabled, these commands will run outside the container and before docker @@ -239,6 +291,13 @@ head_setup_commands: [] # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] -head_node: {} +# Command to start ray on the head node. You don't need to change this. +# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward. 
+head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 -worker_nodes: {} +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/example-full-legacy.yaml b/python/ray/autoscaler/kubernetes/example-full-legacy.yaml deleted file mode 100644 index 1af270ed4f8a..000000000000 --- a/python/ray/autoscaler/kubernetes/example-full-legacy.yaml +++ /dev/null @@ -1,261 +0,0 @@ -# A unique identifier for the head node and workers of this cluster. -cluster_name: example-cluster - -# The minimum number of workers nodes to launch in addition to the head -# node. This number should be >= 0. -min_workers: 0 - -# The maximum number of workers nodes to launch in addition to the head -# node. This takes precedence over min_workers. -max_workers: 2 - -# The autoscaler will scale up the cluster faster with higher upscaling speed. -# E.g., if the task requires adding more nodes then autoscaler will gradually -# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. -# This number should be > 0. -upscaling_speed: 1.0 - -# If a node is idle for this many minutes, it will be removed. -idle_timeout_minutes: 5 - -# Kubernetes resources that need to be configured for the autoscaler to be -# able to manage the Ray cluster. If any of the provided resources don't -# exist, the autoscaler will attempt to create them. If this fails, you may -# not have the required permissions and will have to request them to be -# created by your cluster administrator. -provider: - type: kubernetes - - # Exposing external IP addresses for ray pods isn't currently supported. 
- use_internal_ips: true - - # Namespace to use for all resources created. - namespace: ray - - # ServiceAccount created by the autoscaler for the head node pod that it - # runs in. If this field isn't provided, the head pod config below must - # contain a user-created service account with the proper permissions. - autoscaler_service_account: - apiVersion: v1 - kind: ServiceAccount - metadata: - name: autoscaler - - # Role created by the autoscaler for the head node pod that it runs in. - # If this field isn't provided, the role referenced in - # autoscaler_role_binding must exist and have at least these permissions. - autoscaler_role: - kind: Role - apiVersion: rbac.authorization.k8s.io/v1 - metadata: - name: autoscaler - rules: - - apiGroups: [""] - resources: ["pods", "pods/status", "pods/exec"] - verbs: ["get", "watch", "list", "create", "delete", "patch"] - - # RoleBinding created by the autoscaler for the head node pod that it runs - # in. If this field isn't provided, the head pod config below must contain - # a user-created service account with the proper permissions. - autoscaler_role_binding: - apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - name: autoscaler - subjects: - - kind: ServiceAccount - name: autoscaler - roleRef: - kind: Role - name: autoscaler - apiGroup: rbac.authorization.k8s.io - - services: - # Service that maps to the head node of the Ray cluster. - - apiVersion: v1 - kind: Service - metadata: - # NOTE: If you're running multiple Ray clusters with services - # on one Kubernetes cluster, they must have unique service - # names. - name: example-cluster-ray-head - spec: - # This selector must match the head node pod's selector below. - selector: - component: example-cluster-ray-head - ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - - -# Kubernetes pod config for the head node pod. 
-head_node: - apiVersion: v1 - kind: Pod - metadata: - # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - - # Must match the head node service selector above if a head node - # service is required. - labels: - component: example-cluster-ray-head - spec: - # Change this if you altered the autoscaler_service_account above - # or want to provide your own. - serviceAccountName: autoscaler - - # Restarting the head node automatically is not currently supported. - # If the head node goes down, `ray up` must be run again. - restartPolicy: Never - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumes: - - name: dshm - emptyDir: - medium: Memory - - containers: - - name: ray-node - imagePullPolicy: Always - # You are free (and encouraged) to use your own container image, - # but it should have the following installed: - # - rsync (used for `ray rsync` commands and file mounts) - # - screen (used for `ray attach`) - # - kubectl (used by the autoscaler to manage worker pods) - image: rayproject/ray:nightly - # Do not change this command - it keeps the pod alive until it is - # explicitly killed. - command: ["/bin/bash", "-c", "--"] - args: ["trap : TERM INT; sleep infinity & wait;"] - ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. 
The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 2Gi - -# Kubernetes pod config for worker node pods. -worker_nodes: - apiVersion: v1 - kind: Pod - metadata: - # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-worker- - - # Must match the worker node service selector above if a worker node - # service is required. - labels: - component: ray-worker - spec: - serviceAccountName: default - - # Worker nodes will be managed automatically by the head node, so - # do not change the restart policy. - restartPolicy: Never - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumes: - - name: dshm - emptyDir: - medium: Memory - - containers: - - name: ray-node - imagePullPolicy: Always - # You are free (and encouraged) to use your own container image, - # but it should have the following installed: - # - rsync (used for `ray rsync` commands and file mounts) - image: rayproject/ray:nightly - # Do not change this command - it keeps the pod alive until it is - # explicitly killed. - command: ["/bin/bash", "-c", "--"] - args: ["trap : TERM INT; sleep infinity & wait;"] - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # This memory limit will be detected by ray and split into - # 30% for plasma, and 70% for workers. 
- memory: 2Gi - -# Files or directories to copy to the head and worker nodes. The format is a -# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. -file_mounts: { -# "~/path1/on/remote/machine": "/path1/on/local/machine", -# "~/path2/on/remote/machine": "/path2/on/local/machine", -} -# Note that the container images in this example have a non-root user. -# To avoid permissions issues, we recommend mounting into a subdirectory of home (~). - -# Files or directories to copy from the head node to the worker nodes. The format is a -# list of paths. The same path on the head node will be copied to the worker node. -# This behavior is a subset of the file_mounts behavior. In the vast majority of cases -# you should just use file_mounts. Only use this if you know what you're doing! -cluster_synced_files: [] - -# Whether changes to directories in file_mounts or cluster_synced_files in the head node -# should sync to the worker node continuously -file_mounts_sync_continuously: False - - -# List of commands that will be run before `setup_commands`. If docker is -# enabled, these commands will run outside the container and before docker -# is setup. -initialization_commands: [] - -# List of shell commands to run to set up nodes. -setup_commands: [] - -# Custom commands that will be run on the head node after common setup. -head_setup_commands: [] - -# Custom commands that will be run on worker nodes after common setup. -worker_setup_commands: [] - -# Command to start ray on the head node. You don't need to change this. -# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. -head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 - -# Command to start ray on worker nodes. You don't need to change this. 
-worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 diff --git a/python/ray/autoscaler/kubernetes/example-full.yaml b/python/ray/autoscaler/kubernetes/example-full.yaml index cb09545d4f09..80ada3b27966 100644 --- a/python/ray/autoscaler/kubernetes/example-full.yaml +++ b/python/ray/autoscaler/kubernetes/example-full.yaml @@ -1,8 +1,12 @@ -# A unique identifier for the head node and workers of this cluster. -cluster_name: example-cluster +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 # The maximum number of workers nodes to launch in addition to the head -# node. +# node. This takes precedence over min_workers. max_workers: 2 # The autoscaler will scale up the cluster faster with higher upscaling speed. @@ -74,86 +78,127 @@ provider: # NOTE: If you're running multiple Ray clusters with services # on one Kubernetes cluster, they must have unique service # names. - name: example-cluster-ray-head + name: ray-head spec: # This selector must match the head node pod's selector below. selector: - component: example-cluster-ray-head + component: ray-head + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + + # Service that maps to the worker nodes of the Ray cluster. + - apiVersion: v1 + kind: Service + metadata: + # NOTE: If you're running multiple Ray clusters with services + # on one Kubernetes cluster, they must have unique service + # names. + name: ray-workers + spec: + # This selector must match the worker node pods' selector below. + selector: + component: ray-worker ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - -# Specify the pod type for the ray head node (as configured below). 
-head_node_type: head_node -# Specify the allowed pod types for this ray cluster and the resources they provide. -available_node_types: - worker_node: - # Minimum number of Ray workers of this Pod type. - min_workers: 0 - # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers. - max_workers: 2 - # User-specified custom resources for use by Ray. Object with string keys and integer values. - # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.) - resources: {"foo": 1, "bar": 2} - node_config: - apiVersion: v1 - kind: Pod - metadata: + - protocol: TCP + port: 8000 + targetPort: 8000 + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-worker- - spec: + generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. + labels: + component: ray-head + spec: + # Change this if you altered the autoscaler_service_account above + # or want to provide your own. + serviceAccountName: autoscaler + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. 
volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) image: rayproject/ray:nightly + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi - head_node: - node_config: - apiVersion: v1 - kind: Pod - metadata: + requests: + cpu: 1000m + memory: 512Mi + limits: + # The maximum memory that this pod is allowed to use. The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. 
If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - # Must match the head node service selector above if a head node + generateName: ray-worker- + + # Must match the worker node service selector above if a worker node # service is required. labels: - component: example-cluster-ray-head - spec: - # Change this if you altered the autoscaler_service_account above - # or want to provide your own. - serviceAccountName: autoscaler + component: ray-worker + spec: + serviceAccountName: default + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. restartPolicy: Never # This volume allocates shared memory for Ray to use for its plasma @@ -162,48 +207,96 @@ available_node_types: volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) image: rayproject/ray:nightly # Do not change this command - it keeps the pod alive until it is # explicitly killed. 
command: ["/bin/bash", "-c", "--"] - args: ['trap : TERM INT; sleep infinity & wait;'] + args: ["trap : TERM INT; sleep infinity & wait;"] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi + requests: + cpu: 1000m + memory: 512Mi + limits: + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "~/path1/on/remote/machine": "/path1/on/local/machine", +# "~/path2/on/remote/machine": "/path2/on/local/machine", +} +# Note that the container images in this example have a non-root user. +# To avoid permissions issues, we recommend mounting into a subdirectory of home (~). 
+ +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# Patterns for files to exclude when running rsync up or rsync down. +# This is not supported on kubernetes. +# rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +# This is not supported on kubernetes. +# rsync_filter: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: [] + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] # Command to start ray on the head node. You don't need to change this. -# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. +# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward. 
head_start_ray_commands: - ray stop - - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 + - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 # Command to start ray on worker nodes. You don't need to change this. worker_start_ray_commands: - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/example-minimal.yaml b/python/ray/autoscaler/kubernetes/example-minimal.yaml index dc5b95d0f336..62cf855db8fb 100644 --- a/python/ray/autoscaler/kubernetes/example-minimal.yaml +++ b/python/ray/autoscaler/kubernetes/example-minimal.yaml @@ -1,9 +1,9 @@ # An unique identifier for the head node and workers of this cluster. -cluster_name: example-cluster +cluster_name: minimal # The maximum number of workers nodes to launch in addition to the head -# node. -max_workers: 2 +# node. This takes precedence over min_workers. min_workers default to 0. +max_workers: 1 # Kubernetes resources that need to be configured for the autoscaler to be # able to manage the Ray cluster. If any of the provided resources don't @@ -56,26 +56,3 @@ provider: kind: Role name: autoscaler apiGroup: rbac.authorization.k8s.io - - services: - # Service that maps to the head node of the Ray cluster. - - apiVersion: v1 - kind: Service - metadata: - # NOTE: If you're running multiple Ray clusters with services - # on one Kubernetes cluster, they must have unique service - # names. - name: example-cluster-ray-head - spec: - # This selector must match the head node pod's selector below. 
- selector: - component: example-cluster-ray-head - ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 diff --git a/python/ray/autoscaler/kubernetes/example_scripts/job_example.py b/python/ray/autoscaler/kubernetes/example_scripts/job_example.py deleted file mode 100644 index e58a789ee6ae..000000000000 --- a/python/ray/autoscaler/kubernetes/example_scripts/job_example.py +++ /dev/null @@ -1,71 +0,0 @@ -from collections import Counter -import os -import sys -import time -import ray - -""" This script is meant to be run from a pod in the same Kubernetes namespace -as your Ray cluster. - -Just below are the environment variables used to access Ray client via a -service targetting the Ray cluster's head node pod. -These environment variables are set by Kubernetes. -See https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables -In the documentation examples, the head service has -"example-cluster-ray-head" and the relevant port is named "client". -Modify the environment variables as needed to match the name of the service -and port. - -Note: The default head service set up by the Ray Kubernetes operator is named --ray-head, -where is the metadata.name field you set in the RayCluster -custom resource. -""" # noqa -HEAD_SERVICE_IP_ENV = "EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_HOST" -HEAD_SERVICE_CLIENT_PORT_ENV = "EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_PORT_CLIENT" - - -@ray.remote -def gethostname(x): - import platform - import time - time.sleep(0.01) - return x + (platform.node(), ) - - -def wait_for_nodes(expected): - # Wait for all nodes to join the cluster. 
- while True: - resources = ray.cluster_resources() - node_keys = [key for key in resources if "node" in key] - num_nodes = sum(resources[node_key] for node_key in node_keys) - if num_nodes < expected: - print("{} nodes have joined so far, waiting for {} more.".format( - num_nodes, expected - num_nodes)) - sys.stdout.flush() - time.sleep(1) - else: - break - - -def main(): - wait_for_nodes(3) - - # Check that objects can be transferred from each node to each other node. - for i in range(10): - print("Iteration {}".format(i)) - results = [ - gethostname.remote(gethostname.remote(())) for _ in range(100) - ] - print(Counter(ray.get(results))) - sys.stdout.flush() - - print("Success!") - sys.stdout.flush() - - -if __name__ == "__main__": - head_service_ip = os.environ[HEAD_SERVICE_IP_ENV] - client_port = os.environ[HEAD_SERVICE_CLIENT_PORT_ENV] - ray.util.connect(f"{head_service_ip}:{client_port}") - main() diff --git a/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py b/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py deleted file mode 100644 index 667f8c628960..000000000000 --- a/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py +++ /dev/null @@ -1,58 +0,0 @@ -from collections import Counter -import sys -import time -import ray -""" Run this script locally to execute a Ray program on your Ray cluster on -Kubernetes. - -Before running this script, you must port-forward from the local host to -the relevant Kubernetes head service e.g. -kubectl -n ray port-forward service/example-cluster-ray-head 10001:10001. - -Set the constant LOCAL_PORT below to the local port being forwarded. -""" -LOCAL_PORT = 10001 - - -@ray.remote -def gethostname(x): - import platform - import time - time.sleep(0.01) - return x + (platform.node(), ) - - -def wait_for_nodes(expected): - # Wait for all nodes to join the cluster. 
- while True: - resources = ray.cluster_resources() - node_keys = [key for key in resources if "node" in key] - num_nodes = sum(resources[node_key] for node_key in node_keys) - if num_nodes < expected: - print("{} nodes have joined so far, waiting for {} more.".format( - num_nodes, expected - num_nodes)) - sys.stdout.flush() - time.sleep(1) - else: - break - - -def main(): - wait_for_nodes(3) - - # Check that objects can be transferred from each node to each other node. - for i in range(10): - print("Iteration {}".format(i)) - results = [ - gethostname.remote(gethostname.remote(())) for _ in range(100) - ] - print(Counter(ray.get(results))) - sys.stdout.flush() - - print("Success!") - sys.stdout.flush() - - -if __name__ == "__main__": - ray.util.connect(f"127.0.0.1:{LOCAL_PORT}") - main() diff --git a/python/ray/autoscaler/kubernetes/example_scripts/run_on_head.py b/python/ray/autoscaler/kubernetes/example_scripts/run_on_head.py deleted file mode 100644 index 3def71effcf2..000000000000 --- a/python/ray/autoscaler/kubernetes/example_scripts/run_on_head.py +++ /dev/null @@ -1,50 +0,0 @@ -from collections import Counter -import sys -import time -import ray - -# Run this script on the Ray head node using kubectl exec. - - -@ray.remote -def gethostname(x): - import platform - import time - time.sleep(0.01) - return x + (platform.node(), ) - - -def wait_for_nodes(expected): - # Wait for all nodes to join the cluster. - while True: - resources = ray.cluster_resources() - node_keys = [key for key in resources if "node" in key] - num_nodes = sum(resources[node_key] for node_key in node_keys) - if num_nodes < expected: - print("{} nodes have joined so far, waiting for {} more.".format( - num_nodes, expected - num_nodes)) - sys.stdout.flush() - time.sleep(1) - else: - break - - -def main(): - wait_for_nodes(3) - - # Check that objects can be transferred from each node to each other node. 
- for i in range(10): - print("Iteration {}".format(i)) - results = [ - gethostname.remote(gethostname.remote(())) for _ in range(100) - ] - print(Counter(ray.get(results))) - sys.stdout.flush() - - print("Success!") - sys.stdout.flush() - - -if __name__ == "__main__": - ray.init(address="auto") - main() diff --git a/python/ray/autoscaler/kubernetes/job-example.yaml b/python/ray/autoscaler/kubernetes/job-example.yaml deleted file mode 100644 index b5e140dc8036..000000000000 --- a/python/ray/autoscaler/kubernetes/job-example.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Job to run a Ray program in its own pod. Assumes that a Ray cluster is already -# running. -apiVersion: batch/v1 -kind: Job -metadata: - generateName: ray-test-job- -spec: - template: - spec: - restartPolicy: Never - containers: - - name: ray - image: rayproject/ray:nightly - imagePullPolicy: Always - command: ["python"] - args: - - "$(EXAMPLE_PROGRAM_PATH)" - env: - - name: EXAMPLE_PROGRAM_PATH - value: "/home/ray/anaconda3/lib/python3.7/site-packages/ray/autoscaler/kubernetes/example_scripts/job_example.py" - resources: - requests: - cpu: 100m - memory: 512Mi diff --git a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml index df7a33254cf5..5387803c136e 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml @@ -78,9 +78,9 @@ spec: description: Maximum number of Ray workers of this Pod type. rayResources: type: object - description: User-specified custom resources for use by Ray. Keys strings, values integers. - # TODO (dmitri): Validate that values are integers [patternProperties not supported by OpenAPI v3.0] - x-kubernetes-preserve-unknown-fields: true + description: User-specified custom resources for use by Ray. 
+ # TODO (dmitri): Validate that values are numeric [patternProperties not supported by OpenAPI v3.0] + x-kubernetes-preserve-unknown-fields: true setupCommands: description: Commands to run before starting the Ray runtime. type: array diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml index 34018f0c47d0..2735c72eb948 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml @@ -42,9 +42,9 @@ spec: command: ["/bin/bash", "-c", "--"] args: ['trap : TERM INT; sleep infinity & wait;'] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 6379 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to @@ -65,14 +65,16 @@ spec: # allocate a very large object store in each pod that may # cause problems for other pods. memory: 512Mi - - name: worker-node + - name: worker-nodes # Minimum number of Ray workers of this Pod type. minWorkers: 2 # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers. maxWorkers: 3 - # User-specified custom resources for use by Ray. - # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.) - rayResources: {"foo": 1, "bar": 1} + # User-specified custom resources for use by Ray + rayResources: {"Custom1": 1, "is_spot": 1} + # Optional commands to run before starting the Ray runtime. 
+ setupCommands: + - pip install numpy # Example podConfig: apiVersion: v1 kind: Pod @@ -91,6 +93,9 @@ spec: image: rayproject/ray:nightly command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. @@ -113,9 +118,9 @@ spec: # Commands to start Ray on the head node. You don't need to change this. # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward. headStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0 + - ray stop + - ulimit -n 65536; ray start --head --no-monitor --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 # Commands to start Ray on worker nodes. You don't need to change this. workerStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml index c244a589faac..7341e16fa914 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml @@ -42,9 +42,9 @@ spec: command: ["/bin/bash", "-c", "--"] args: ['trap : TERM INT; sleep infinity & wait;'] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 6379 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. 
# This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to @@ -65,14 +65,16 @@ spec: # allocate a very large object store in each pod that may # cause problems for other pods. memory: 512Mi - - name: worker-node + - name: worker-nodes # Minimum number of Ray workers of this Pod type. minWorkers: 1 # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers. maxWorkers: 3 - # User-specified custom resources for use by Ray. Object with string keys and integer values. - # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.) - rayResources: {"baz": 5, "quux": 17} + # User-specified custom resources for use by Ray + rayResources: {"Custom1": 1, "is_spot": 1} + # Optional commands to run before starting the Ray runtime. + setupCommands: + - pip install numpy # Example podConfig: apiVersion: v1 kind: Pod @@ -91,6 +93,9 @@ spec: image: rayproject/ray:nightly command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. @@ -113,9 +118,9 @@ spec: # Commands to start Ray on the head node. You don't need to change this. # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward. headStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0 + - ray stop + - ulimit -n 65536; ray start --head --no-monitor --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 # Commands to start Ray on worker nodes. You don't need to change this. 
workerStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml b/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml index f0f43a1efdc9..6f259a9a7467 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml @@ -10,7 +10,7 @@ metadata: name: ray-operator-role rules: - apiGroups: ["", "cluster.ray.io"] - resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec", "services"] + resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec"] verbs: ["get", "watch", "list", "create", "delete", "patch", "update"] --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index a5d927a01178..df157bdc067c 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -337,12 +337,8 @@ "min_workers": {"type": "integer"}, "max_workers": {"type": "integer"}, "resources": { - "patternProperties": { - ".*":{ - "type": "integer", - "minimum": 0 - } - } + "type": "object", + ".*": {"type": "number"} }, "initialization_commands": { "$ref": "#/definitions/commands", diff --git a/python/ray/ray_operator/operator_utils.py b/python/ray/ray_operator/operator_utils.py index 3dc50e9a1529..e20cd6719b21 100644 --- a/python/ray/ray_operator/operator_utils.py +++ b/python/ray/ray_operator/operator_utils.py @@ -6,7 +6,6 @@ from kubernetes.watch import Watch from ray.autoscaler._private.kubernetes import custom_objects_api -from ray.autoscaler._private.providers import _get_default_config RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE") @@ -60,64 +59,36 @@ def cr_to_config(cluster_resource: Dict[str, 
Any]) -> Dict[str, Any]: """Convert RayCluster custom resource to a ray cluster config for use by the autoscaler.""" config = translate(cluster_resource["spec"], dictionary=CONFIG_FIELDS) - cluster_name = cluster_resource["metadata"]["name"] - config["available_node_types"] = get_node_types(cluster_resource, - cluster_name) - config["cluster_name"] = cluster_name - config["provider"] = get_provider_config(cluster_name) + config["available_node_types"] = get_node_types(cluster_resource) + config["cluster_name"] = cluster_resource["metadata"]["name"] + config["provider"] = PROVIDER_CONFIG return config -def get_node_types(cluster_resource: Dict[str, Any], cluster_name) ->\ - Dict[str, Any]: - cluster_owner_reference = get_cluster_owner_reference( - cluster_resource, cluster_name) +def get_node_types(cluster_resource: Dict[str, Any]) -> Dict[str, Any]: + cluster_owner_reference = get_cluster_owner_reference(cluster_resource) node_types = {} for pod_type in cluster_resource["spec"]["podTypes"]: name = pod_type["name"] pod_type_copy = copy.deepcopy(pod_type) pod_type_copy.pop("name") - node_type = translate(pod_type_copy, dictionary=NODE_TYPE_FIELDS) - metadata = node_type["node_config"]["metadata"] - metadata.update({"ownerReferences": [cluster_owner_reference]}) - if name == cluster_resource["spec"]["headPodType"]: - if "labels" not in metadata: - metadata["labels"] = {} - metadata["labels"].update(head_service_selector(cluster_name)) - node_types[name] = node_type + node_types[name] = translate( + pod_type_copy, dictionary=NODE_TYPE_FIELDS) + # Deleting a RayCluster CR will also delete the associated pods. 
+ node_types[name]["node_config"]["metadata"].update({ + "ownerReferences": [cluster_owner_reference] + }) return node_types -def get_provider_config(cluster_name): - default_kubernetes_config = _get_default_config({"type": "kubernetes"}) - default_provider_conf = default_kubernetes_config["provider"] - - # Configure head service for dashboard and client - head_service = copy.deepcopy(default_provider_conf["services"][0]) - service_name = f"{cluster_name}-ray-head" - head_service["metadata"]["name"] = service_name - head_service["spec"]["selector"] = head_service_selector(cluster_name) - - provider_conf = {} - provider_conf["type"] = "kubernetes" - provider_conf["use_internal_ips"] = True - provider_conf["namespace"] = RAY_NAMESPACE - provider_conf["services"] = [head_service] - return provider_conf - - -def head_service_selector(cluster_name): - return {"component": f"{cluster_name}-ray-head"} - - -def get_cluster_owner_reference(cluster_resource: Dict[str, Any], - cluster_name: str) -> Dict[str, Any]: +def get_cluster_owner_reference( + cluster_resource: Dict[str, Any]) -> Dict[str, Any]: return { "apiVersion": cluster_resource["apiVersion"], "kind": cluster_resource["kind"], "blockOwnerDeletion": True, "controller": True, - "name": cluster_name, + "name": cluster_resource["metadata"]["name"], "uid": cluster_resource["metadata"]["uid"] } diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py index 5595382a02ea..10edbb8fe7e0 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -11,8 +11,6 @@ from ray.autoscaler._private.util import prepare_config, validate_config from ray.autoscaler._private.providers import _NODE_PROVIDERS -from ray.autoscaler._private.kubernetes.node_provider import\ - KubernetesNodeProvider from ray.test_utils import recursive_fnmatch @@ -27,7 +25,6 @@ def ignore_k8s_operator_configs(paths): return [ path for path in paths if 
"kubernetes/operator_configs" not in path - and "kubernetes/job-example.yaml" not in path ] @@ -43,14 +40,10 @@ def testValidateDefaultConfig(self): with open(config_path) as f: config = yaml.safe_load(f) config = prepare_config(config) - if config["provider"]["type"] == "kubernetes": - KubernetesNodeProvider.fillout_available_node_types_resources( - config) try: validate_config(config) except Exception: - self.fail( - f"Config {config_path} did not pass validation test!") + self.fail("Config did not pass validation test!") @pytest.mark.skipif( sys.platform.startswith("win"), reason="Fails on Windows.") diff --git a/python/ray/tests/test_k8s_cluster_launcher.py b/python/ray/tests/test_k8s_cluster_launcher.py index 49ecadd688bb..eb6d596b93e5 100644 --- a/python/ray/tests/test_k8s_cluster_launcher.py +++ b/python/ray/tests/test_k8s_cluster_launcher.py @@ -69,8 +69,8 @@ def test_up_and_down(self): while True: monitor_output = sdk.run_on_cluster( config, cmd=log_cmd, with_output=True).decode() - if ("head-node" in monitor_output - and "worker-node" in monitor_output): + if ("ray-legacy-head-node-type" in monitor_output + and "ray-legacy-worker-node-type" in monitor_output): break else: time.sleep(1) diff --git a/python/ray/tests/test_k8s_operator_examples.py b/python/ray/tests/test_k8s_operator_examples.py index 025ad1709172..1636b347bd14 100644 --- a/python/ray/tests/test_k8s_operator_examples.py +++ b/python/ray/tests/test_k8s_operator_examples.py @@ -20,7 +20,7 @@ def retry_until_true(f): # Retry 60 times with 1 second delay between attempts. 
def f_with_retries(*args, **kwargs): - for _ in range(120): + for _ in range(60): if f(*args, **kwargs): return else: @@ -47,38 +47,25 @@ def wait_for_logs(): cmd = f"kubectl -n {NAMESPACE} logs ray-operator-pod"\ "| grep ^example-cluster: | tail -n 100" log_tail = subprocess.check_output(cmd, shell=True).decode() - return ("head-node" in log_tail) and ("worker-node" in log_tail) + return ("head-node" in log_tail) and ("worker-nodes" in log_tail) -@retry_until_true -def wait_for_job(job_pod): - cmd = f"kubectl -n {NAMESPACE} logs {job_pod}" - out = subprocess.check_output(cmd, shell=True).decode() - return ("success" in out.lower()) - - -def kubernetes_configs_directory(): +def operator_configs_directory(): here = os.path.realpath(__file__) ray_python_root = os.path.dirname(os.path.dirname(here)) - relative_path = "autoscaler/kubernetes" + relative_path = "autoscaler/kubernetes/operator_configs" return os.path.join(ray_python_root, relative_path) -def get_kubernetes_config_path(name): - return os.path.join(kubernetes_configs_directory(), name) - - def get_operator_config_path(file_name): - operator_configs = get_kubernetes_config_path("operator_configs") - return os.path.join(operator_configs, file_name) + return os.path.join(operator_configs_directory(), file_name) class KubernetesOperatorTest(unittest.TestCase): def test_examples(self): with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \ tempfile.NamedTemporaryFile("w+") as example_cluster2_file,\ - tempfile.NamedTemporaryFile("w+") as operator_file,\ - tempfile.NamedTemporaryFile("w+") as job_file: + tempfile.NamedTemporaryFile("w+") as operator_file: # Get paths to operator configs example_cluster_config_path = get_operator_config_path( @@ -86,7 +73,6 @@ def test_examples(self): example_cluster2_config_path = get_operator_config_path( "example_cluster2.yaml") operator_config_path = get_operator_config_path("operator.yaml") - job_path = get_kubernetes_config_path("job-example.yaml") self.crd_path = 
get_operator_config_path("cluster_crd.yaml") # Load operator configs @@ -96,23 +82,19 @@ def test_examples(self): open(example_cluster2_config_path).read()) operator_config = list( yaml.safe_load_all(open(operator_config_path).read())) - job_config = yaml.safe_load(open(job_path).read()) # Fill image fields podTypes = example_cluster_config["spec"]["podTypes"] podTypes2 = example_cluster2_config["spec"]["podTypes"] - pod_specs = ([operator_config[-1]["spec"]] + [ - job_config["spec"]["template"]["spec"] - ] + [podType["podConfig"]["spec"] for podType in podTypes - ] + [podType["podConfig"]["spec"] for podType in podTypes2]) - for pod_spec in pod_specs: - pod_spec["containers"][0]["image"] = IMAGE - pod_spec["containers"][0]["imagePullPolicy"] = "IfNotPresent" + pod_configs = ([operator_config[-1]] + [ + podType["podConfig"] for podType in podTypes + ] + [podType["podConfig"] for podType in podTypes2]) + for pod_config in pod_configs: + pod_config["spec"]["containers"][0]["image"] = IMAGE # Dump to temporary files yaml.dump(example_cluster_config, example_cluster_file) yaml.dump(example_cluster2_config, example_cluster2_file) - yaml.dump(job_config, job_file) yaml.dump_all(operator_config, operator_file) files = [ example_cluster_file, example_cluster2_file, operator_file @@ -149,19 +131,6 @@ def test_examples(self): # Four pods remain wait_for_pods(4) - # Check job submission - cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}" - subprocess.check_call(cmd, shell=True) - - cmd = f"kubectl -n {NAMESPACE} get pods --no-headers -o"\ - " custom-columns=\":metadata.name\"" - pods = subprocess.check_output(cmd, shell=True).decode().split() - job_pod = [pod for pod in pods if "job" in pod].pop() - time.sleep(10) - wait_for_job(job_pod) - cmd = f"kubectl -n {NAMESPACE} delete jobs --all" - subprocess.check_call(cmd, shell=True) - # Check that cluster updates work: increase minWorkers to 3 # and check that one worker is created. 
example_cluster_edit = copy.deepcopy(example_cluster_config) From a4ddf904d0b1cf649b8ee05a9a2254c26eb23de8 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 059/244] Revert "Fix autoscaler wrong parameter names (#13966)" This reverts commit 4bbb8af47db91d4c7b6802a6e97a6adee33ccb15. --- python/ray/autoscaler/_private/autoscaler.py | 29 ++++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py index 727c4db2effb..1166597ed9d6 100644 --- a/python/ray/autoscaler/_private/autoscaler.py +++ b/python/ray/autoscaler/_private/autoscaler.py @@ -43,7 +43,7 @@ # that will be passed into a NodeUpdaterThread. UpdateInstructions = namedtuple( "UpdateInstructions", - ["node_id", "setup_commands", "ray_start_commands", "docker_config"]) + ["node_id", "init_commands", "start_ray_commands", "docker_config"]) AutoscalerSummary = namedtuple( "AutoscalerSummary", @@ -283,7 +283,7 @@ def _update(self): # problems. They should at a minimum be spawned as daemon threads. # See https://github.com/ray-project/ray/pull/5903 for more info. 
T = [] - for node_id, setup_commands, ray_start_commands, docker_config in ( + for node_id, commands, ray_start, docker_config in ( self.should_update(node_id) for node_id in nodes): if node_id is not None: resources = self._node_resources(node_id) @@ -291,8 +291,8 @@ def _update(self): T.append( threading.Thread( target=self.spawn_updater, - args=(node_id, setup_commands, ray_start_commands, - resources, docker_config))) + args=(node_id, commands, ray_start, resources, + docker_config))) for t in T: t.start() for t in T: @@ -633,25 +633,25 @@ def should_update(self, node_id): successful_updated = self.num_successful_updates.get(node_id, 0) > 0 if successful_updated and self.config.get("restart_only", False): - setup_commands = [] - ray_start_commands = self.config["worker_start_ray_commands"] + init_commands = [] + ray_commands = self.config["worker_start_ray_commands"] elif successful_updated and self.config.get("no_restart", False): - setup_commands = self._get_node_type_specific_fields( + init_commands = self._get_node_type_specific_fields( node_id, "worker_setup_commands") - ray_start_commands = [] + ray_commands = [] else: - setup_commands = self._get_node_type_specific_fields( + init_commands = self._get_node_type_specific_fields( node_id, "worker_setup_commands") - ray_start_commands = self.config["worker_start_ray_commands"] + ray_commands = self.config["worker_start_ray_commands"] docker_config = self._get_node_specific_docker_config(node_id) return UpdateInstructions( node_id=node_id, - setup_commands=setup_commands, - ray_start_commands=ray_start_commands, + init_commands=init_commands, + start_ray_commands=ray_commands, docker_config=docker_config) - def spawn_updater(self, node_id, setup_commands, ray_start_commands, + def spawn_updater(self, node_id, init_commands, ray_start_commands, node_resources, docker_config): logger.info(f"Creating new (spawn_updater) updater thread for node" f" {node_id}.") @@ -665,8 +665,7 @@ def spawn_updater(self, node_id, 
setup_commands, ray_start_commands, initialization_commands=with_head_node_ip( self._get_node_type_specific_fields( node_id, "initialization_commands"), self.head_node_ip), - setup_commands=with_head_node_ip(setup_commands, - self.head_node_ip), + setup_commands=with_head_node_ip(init_commands, self.head_node_ip), ray_start_commands=with_head_node_ip(ray_start_commands, self.head_node_ip), runtime_hash=self.runtime_hash, From 471c708f83ddb21cc147efef7948a53a547bfb12 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 060/244] Revert "random a job id in c++ worker (#13982)" This reverts commit 16cf0c141bf6a13d101c590b635c5fa1a8c047e3. --- cpp/src/ray/util/process_helper.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/ray/util/process_helper.cc b/cpp/src/ray/util/process_helper.cc index 6511b5b8b96b..3ee6a2c34d8e 100644 --- a/cpp/src/ray/util/process_helper.cc +++ b/cpp/src/ray/util/process_helper.cc @@ -70,12 +70,7 @@ void ProcessHelper::RayStart(std::shared_ptr config, options.store_socket = store_socket; options.raylet_socket = raylet_socket; if (options.worker_type == WorkerType::DRIVER) { - /// TODO(Guyang Song): Get next job id from core worker by GCS client. - /// Random a number to avoid repeated job ids. - /// The repeated job ids will lead to task hang when driver connects to a existing - /// cluster more than once. 
- std::srand(std::time(nullptr)); - options.job_id = JobID::FromInt(std::rand()); + options.job_id = JobID::FromInt(0); } options.gcs_options = gcs_options; options.enable_logging = true; From 06cbb03eaf98a2365f7e9d5947c66877f32f9b3f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 061/244] Revert "Revert "[Java] fix test hang occasionally when running FailureTest (#13934)" (#13992)" This reverts commit dffae0be259285c0177ff68aaa5994237b06ab81. --- .../io/ray/runtime/runner/RunManager.java | 2 +- java/test.sh | 57 +++--- .../io/ray/test/TestProgressListener.java | 166 ++++++++++++++++-- java/testng.xml | 2 +- src/ray/core_worker/core_worker.cc | 16 +- src/ray/core_worker/core_worker.h | 2 + 6 files changed, 201 insertions(+), 44 deletions(-) diff --git a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java index 2307b0489d3c..192e5550ceb4 100644 --- a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java +++ b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java @@ -96,7 +96,7 @@ public static void getAddressInfoAndFillConfig(RayConfig rayConfig) { * * @param command The command to start the process with. */ - private static String runCommand(List command) throws IOException, InterruptedException { + public static String runCommand(List command) throws IOException, InterruptedException { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Starting process with command: {}", Joiner.on(" ").join(command)); } diff --git a/java/test.sh b/java/test.sh index a842194e67fb..b49f06037c10 100755 --- a/java/test.sh +++ b/java/test.sh @@ -16,30 +16,27 @@ pushd "$ROOT_DIR" mvn -T16 checkstyle:check popd -on_exit() { - exit_code=$? 
- if [ $exit_code -ne 0 ]; then - echo "Exit trap, printing ray logs" - cat /tmp/ray/session_latest/logs/* - fi -} - -trap on_exit EXIT - run_testng() { + local pid local exit_code - if "$@"; then + "$@" & + pid=$! + if wait $pid; then exit_code=0 else exit_code=$? fi # exit_code == 2 means there are skipped tests. if [ $exit_code -ne 2 ] && [ $exit_code -ne 0 ] ; then - if [ $exit_code -gt 128 ] ; then - # Test crashed. Print the driver log for diagnosis. - cat /tmp/ray/session_latest/logs/java-core-driver-* + # Only print log files if it ran in cluster mode + if [[ ! "$*" =~ SINGLE_PROCESS ]]; then + if [ $exit_code -gt 128 ] ; then + # Test crashed. Print the driver log for diagnosis. + cat /tmp/ray/session_latest/logs/java-core-driver-*$pid* + fi fi - find . -name "hs_err_*log" -exec cat {} + + # Only print the hs_err_pid file of TestNG process + find . -name "hs_err_pid$pid.log" -exec cat {} + exit $exit_code fi } @@ -60,11 +57,31 @@ if ! git diff --exit-code -- java src/ray/core_worker/lib/java; then exit 1 fi -echo "Running tests under cluster mode." -# TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, -# TestNG will exit with code 2. And bazel treats it as test failure. -# bazel test //java:all_tests --config=ci || cluster_exit_code=$? -run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml +# NOTE(kfstrom): Java test troubleshooting only. +# Set MAX_ROUNDS to a big number (e.g. 1000) to run Java tests repeatedly. +# You may also want to modify java/testng.xml to run only a subset of test cases. +MAX_ROUNDS=1 +if [ $MAX_ROUNDS -gt 1 ]; then + export RAY_BACKEND_LOG_LEVEL=debug +fi + +round=1 +while true; do + echo Starting cluster mode test round $round + + echo "Running tests under cluster mode." + # TODO(hchen): Ideally, we should use the following bazel command to run Java tests. 
However, if there're skipped tests, + # TestNG will exit with code 2. And bazel treats it as test failure. + # bazel test //java:all_tests --config=ci || cluster_exit_code=$? + run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml + + echo Finished cluster mode test round $round + date + round=$((round+1)) + if (( round > MAX_ROUNDS )); then + break + fi +done echo "Running tests under single-process mode." # bazel test //java:all_tests --jvmopt="-Dray.run-mode=SINGLE_PROCESS" --config=ci || single_exit_code=$? diff --git a/java/test/src/main/java/io/ray/test/TestProgressListener.java b/java/test/src/main/java/io/ray/test/TestProgressListener.java index 1fed5ac21375..915d82af317b 100644 --- a/java/test/src/main/java/io/ray/test/TestProgressListener.java +++ b/java/test/src/main/java/io/ray/test/TestProgressListener.java @@ -1,27 +1,42 @@ package io.ray.test; +import com.google.common.collect.ImmutableList; +import io.ray.runtime.runner.RunManager; +import java.io.File; import java.time.LocalDateTime; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.SystemUtils; import org.testng.IInvokedMethod; import org.testng.IInvokedMethodListener; import org.testng.ITestContext; import org.testng.ITestListener; import org.testng.ITestResult; +import org.testng.SkipException; public class TestProgressListener implements IInvokedMethodListener, ITestListener { + // Travis aborts CI if no outputs for 10 minutes. So threshold needs to be smaller than 10m. 
+ private static final long hangDetectionThresholdMillis = 5 * 60 * 1000; + private static final int TAIL_NO_OF_LINES = 500; + private Thread testMainThread; + private long testStartTimeMillis; + private String getFullTestName(ITestResult testResult) { return testResult.getTestClass().getName() + "." + testResult.getMethod().getMethodName(); } - private void printInfo(String tag, String content) { + private void printSection(String sectionName) { System.out.println( - "============ [" - + LocalDateTime.now().toString() - + "] [" - + tag - + "] " - + content - + " ============"); + "============ [" + LocalDateTime.now().toString() + "] " + sectionName + " ============"); + } + + private void printTestStage(String tag, String content) { + printSection("[" + tag + "] " + content); } @Override @@ -32,31 +47,50 @@ public void afterInvocation(IInvokedMethod method, ITestResult testResult) {} @Override public void onTestStart(ITestResult result) { - printInfo("TEST START", getFullTestName(result)); + printTestStage("TEST START", getFullTestName(result)); + testStartTimeMillis = System.currentTimeMillis(); + // TODO(kfstorm): Add a timer to detect hang + if (testMainThread == null) { + testMainThread = Thread.currentThread(); + Thread hangDetectionThread = + new Thread( + () -> { + try { + // If current task case has ran for more than 5 minutes. 
+ while (System.currentTimeMillis() - testStartTimeMillis + < hangDetectionThresholdMillis) { + Thread.sleep(1000); + } + printDebugInfo(null, /*testHanged=*/ true); + } catch (InterruptedException e) { + // ignored + } + }); + hangDetectionThread.setDaemon(true); + hangDetectionThread.start(); + } } @Override public void onTestSuccess(ITestResult result) { - printInfo("TEST SUCCESS", getFullTestName(result)); + printTestStage("TEST SUCCESS", getFullTestName(result)); } @Override public void onTestFailure(ITestResult result) { - printInfo("TEST FAILURE", getFullTestName(result)); - Throwable throwable = result.getThrowable(); - if (throwable != null) { - throwable.printStackTrace(); - } + printTestStage("TEST FAILURE", getFullTestName(result)); + printDebugInfo(result, /*testHanged=*/ false); } @Override public void onTestSkipped(ITestResult result) { - printInfo("TEST SKIPPED", getFullTestName(result)); + printTestStage("TEST SKIPPED", getFullTestName(result)); + printDebugInfo(result, /*testHanged=*/ false); } @Override public void onTestFailedButWithinSuccessPercentage(ITestResult result) { - printInfo("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); + printTestStage("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); } @Override @@ -64,4 +98,102 @@ public void onStart(ITestContext context) {} @Override public void onFinish(ITestContext context) {} + + private void printDebugInfo(ITestResult result, boolean testHanged) { + boolean testFailed = false; + if (result != null) { + Throwable throwable = result.getThrowable(); + if (throwable != null && !(throwable instanceof SkipException)) { + testFailed = true; + throwable.printStackTrace(); + } + } + if (!testFailed && !testHanged) { + return; + } + + if (testHanged) { + printSection("TEST CASE HANGED"); + printSection("STACK TRACE OF TEST THREAD"); + for (StackTraceElement element : testMainThread.getStackTrace()) { + System.out.println(element.toString()); + } + Set 
javaPids = getJavaPids(); + for (Integer pid : javaPids) { + runCommandSafely(ImmutableList.of("jstack", pid.toString())); + // TODO(kfstorm): Check lldb or gdb exists rather than detecting OS type. + if (SystemUtils.IS_OS_MAC) { + runCommandSafely( + ImmutableList.of("lldb", "--batch", "-o", "bt all", "-p", pid.toString())); + } else { + runCommandSafely( + ImmutableList.of( + "sudo", "gdb", "-batch", "-ex", "thread apply all bt", "-p", pid.toString())); + } + } + } + + printLogFiles(); + + if (testHanged) { + printSection("ABORT TEST"); + System.exit(1); + } + } + + private String runCommandSafely(List command) { + String output; + String commandString = String.join(" ", command); + printSection(commandString); + try { + output = RunManager.runCommand(command); + System.out.println(output); + } catch (Exception e) { + System.out.println("Failed to execute command: " + commandString); + e.printStackTrace(); + output = ""; + } + return output; + } + + private Set getJavaPids() { + Set javaPids = new HashSet<>(); + String jpsOutput = runCommandSafely(ImmutableList.of("jps", "-v")); + try { + for (String line : StringUtils.split(jpsOutput, "\n")) { + String[] parts = StringUtils.split(line); + if (parts.length > 1 && parts[1].toLowerCase().equals("jps")) { + // Skip jps. 
+ continue; + } + Integer pid = Integer.valueOf(parts[0]); + javaPids.add(pid); + } + } catch (Exception e) { + System.out.println("Failed to parse jps output."); + e.printStackTrace(); + } + + String pgrepJavaResult = runCommandSafely(ImmutableList.of("pgrep", "java")); + try { + for (String line : StringUtils.split(pgrepJavaResult, "\n")) { + Integer pid = Integer.valueOf(line); + javaPids.add(pid); + } + } catch (Exception e) { + System.out.println("Failed to parse pgrep java output."); + e.printStackTrace(); + } + + return javaPids; + } + + private void printLogFiles() { + Collection logFiles = + FileUtils.listFiles(new File("/tmp/ray/session_latest/logs"), null, false); + for (File file : logFiles) { + runCommandSafely( + ImmutableList.of("tail", "-n", String.valueOf(TAIL_NO_OF_LINES), file.getAbsolutePath())); + } + } } diff --git a/java/testng.xml b/java/testng.xml index 6cc10b9ab24a..0db2704845d4 100644 --- a/java/testng.xml +++ b/java/testng.xml @@ -1,6 +1,6 @@ - + diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 6c8287c1507b..262c837011a7 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -161,15 +161,19 @@ CoreWorkerProcess::CoreWorkerProcess(const CoreWorkerOptions &options) // RayConfig is generated in Java_io_ray_runtime_RayNativeRuntime_nativeInitialize // for java worker or in constructor of CoreWorker for python worker. ray::stats::Init(global_tags, options_.metrics_agent_port); + + // NOTE(kfstorm): std::atexit should be put at the end of `CoreWorkerProcess` + // constructor. We assume that spdlog has been initialized before this line. When the + // process is exiting, `HandleAtExit` will be invoked before destructing spdlog static + // variables. We explicitly destruct `CoreWorkerProcess` instance in the callback to + // ensure the static `CoreWorkerProcess` instance is destructed while spdlog is still + // usable. 
This prevents crashing (or hanging) when using `RAY_LOG` in + // `CoreWorkerProcess` destructor. + RAY_CHECK(std::atexit(CoreWorkerProcess::HandleAtExit) == 0); } CoreWorkerProcess::~CoreWorkerProcess() { RAY_LOG(INFO) << "Destructing CoreWorkerProcess. pid: " << getpid(); - { - // Check that all `CoreWorker` instances have been removed. - absl::ReaderMutexLock lock(&worker_map_mutex_); - RAY_CHECK(workers_.empty()); - } RAY_LOG(DEBUG) << "Stats stop in core worker."; // Shutdown stats module if worker process exits. ray::stats::Shutdown(); @@ -183,6 +187,8 @@ void CoreWorkerProcess::EnsureInitialized() { << "shutdown."; } +void CoreWorkerProcess::HandleAtExit() { instance_.reset(); } + std::shared_ptr CoreWorkerProcess::TryGetWorker(const WorkerID &worker_id) { if (!instance_) { return nullptr; diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 6fa24c29e94e..72ef4f36ca7b 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -265,6 +265,8 @@ class CoreWorkerProcess { /// \return Void. static void EnsureInitialized(); + static void HandleAtExit(); + /// Get the `CoreWorker` instance by worker ID. /// /// \param[in] workerId The worker ID. From 32ea02cb4aeb51483d66fefef6874b35b8bf4f05 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 062/244] Revert "[Doc] Update actor resource information (#13909)" This reverts commit 7f7f2cf087171424f9abc93e1da4f48ce6186e28. 
--- doc/source/actors.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/actors.rst b/doc/source/actors.rst index d82559af86b0..9e4a0fd34dba 100644 --- a/doc/source/actors.rst +++ b/doc/source/actors.rst @@ -174,12 +174,14 @@ have these resources (see `configuration instructions * If you specify resource requirements in an actor class's remote decorator, then the actor will acquire those resources for its entire lifetime (if you - do not specify CPU resources, the default is 0), even if it is not executing + do not specify CPU resources, the default is 1), even if it is not executing any methods. The actor will not acquire any additional resources when executing methods. * If you do not specify any resource requirements in the actor class's remote decorator, then by default, the actor will not acquire any resources for its - lifetime. + lifetime, but every time it executes a method, it will need to acquire 1 CPU + resource. + .. tabs:: .. code-tab:: python From 7f614337b56d95ee0b470252dd6b395739d37ab4 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 063/244] Revert "[RLlib] Extend on_learn_on_batch callback to allow for custom metrics to be added. (#13584)" This reverts commit 5a3a5a927a595de8b6a5c439d2098451c41052d1. 
--- rllib/BUILD | 32 +++++++++--------- rllib/agents/callbacks.py | 16 +++++++-- rllib/agents/marwil/tests/test_marwil.py | 2 +- rllib/agents/sac/sac_tf_model.py | 2 -- rllib/agents/sac/sac_torch_model.py | 2 -- rllib/env/policy_client.py | 1 + rllib/env/policy_server_input.py | 1 + rllib/evaluation/metrics.py | 15 +-------- .../tests/test_trajectory_view_api.py | 2 +- .../examples/custom_metrics_and_callbacks.py | 13 +------- rllib/examples/serving/cartpole_client.py | 2 +- rllib/examples/serving/cartpole_server.py | 2 -- rllib/execution/metric_ops.py | 3 -- rllib/execution/train_ops.py | 18 +++------- rllib/policy/eager_tf_policy.py | 9 ++--- rllib/policy/tf_policy.py | 15 +++------ rllib/policy/torch_policy.py | 4 +-- rllib/tests/test_supported_multi_agent.py | 5 +-- rllib/tests/test_supported_spaces.py | 33 +++++++++---------- rllib/utils/sgd.py | 12 +++---- 20 files changed, 73 insertions(+), 116 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 431f6b75ab19..05c09d85d8b9 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1466,29 +1466,29 @@ py_test( args = ["TestSupportedMultiAgentPG"] ) +#py_test( +# name = "tests/test_supported_multi_agent_off_policy", +# main = "tests/test_supported_multi_agent.py", +# tags = ["tests_dir", "tests_dir_S"], +# size = "medium", +# srcs = ["tests/test_supported_multi_agent.py"], +# args = ["TestSupportedMultiAgentOffPolicy"] +#) + py_test( - name = "tests/test_supported_multi_agent_off_policy", - main = "tests/test_supported_multi_agent.py", + name = "tests/test_supported_spaces_pg", + main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], - size = "medium", - srcs = ["tests/test_supported_multi_agent.py"], - args = ["TestSupportedMultiAgentOffPolicy"] + size = "enormous", + srcs = ["tests/test_supported_spaces.py"], + args = ["TestSupportedSpacesPG"] ) -# py_test( -# name = "tests/test_supported_spaces_pg", -# main = "tests/test_supported_spaces.py", -# tags = ["tests_dir", "tests_dir_S"], -# size = 
"enormous", -# srcs = ["tests/test_supported_spaces.py"], -# args = ["TestSupportedSpacesPG"] -# ) - py_test( name = "tests/test_supported_spaces_off_policy", main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], - size = "medium", + size = "enormous", srcs = ["tests/test_supported_spaces.py"], args = ["TestSupportedSpacesOffPolicy"] ) @@ -1497,7 +1497,7 @@ py_test( name = "tests/test_supported_spaces_evolution_algos", main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], - size = "medium", + size = "large", srcs = ["tests/test_supported_spaces.py"], args = ["TestSupportedSpacesEvolutionAlgos"] ) diff --git a/rllib/agents/callbacks.py b/rllib/agents/callbacks.py index 1972fabec711..e84cf41485b7 100644 --- a/rllib/agents/callbacks.py +++ b/rllib/agents/callbacks.py @@ -7,6 +7,7 @@ from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.typing import AgentID, PolicyID +from ray.util.debug import log_once if TYPE_CHECKING: from ray.rllib.evaluation import RolloutWorker @@ -55,6 +56,10 @@ def on_episode_start(self, kwargs: Forward compatibility placeholder. """ + if env_index is not None: + if log_once("callbacks_env_index_deprecated"): + deprecation_warning("env_index", "episode.env_id", error=False) + if self.legacy_callbacks.get("on_episode_start"): self.legacy_callbacks["on_episode_start"]({ "env": base_env, @@ -84,6 +89,10 @@ def on_episode_step(self, kwargs: Forward compatibility placeholder. """ + if env_index is not None: + if log_once("callbacks_env_index_deprecated"): + deprecation_warning("env_index", "episode.env_id", error=False) + if self.legacy_callbacks.get("on_episode_step"): self.legacy_callbacks["on_episode_step"]({ "env": base_env, @@ -115,6 +124,10 @@ def on_episode_end(self, kwargs: Forward compatibility placeholder. 
""" + if env_index is not None: + if log_once("callbacks_env_index_deprecated"): + deprecation_warning("env_index", "episode.env_id", error=False) + if self.legacy_callbacks.get("on_episode_end"): self.legacy_callbacks["on_episode_end"]({ "env": base_env, @@ -175,7 +188,7 @@ def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, }) def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, - result: dict, **kwargs) -> None: + **kwargs) -> None: """Called at the beginning of Policy.learn_on_batch(). Note: This is called before 0-padding via @@ -185,7 +198,6 @@ def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, policy (Policy): Reference to the current Policy object. train_batch (SampleBatch): SampleBatch to be trained on. You can mutate this object to modify the samples generated. - result (dict): A results dict to add custom metrics to. kwargs: Forward compatibility placeholder. """ diff --git a/rllib/agents/marwil/tests/test_marwil.py b/rllib/agents/marwil/tests/test_marwil.py index a0b3caa1079e..afb3ec9ee261 100644 --- a/rllib/agents/marwil/tests/test_marwil.py +++ b/rllib/agents/marwil/tests/test_marwil.py @@ -51,7 +51,7 @@ def test_marwil_compilation_and_learning_from_offline_file(self): min_reward = 70.0 # Test for all frameworks. 
- for _ in framework_iterator(config, frameworks=("tf", "torch")): + for _ in framework_iterator(config): trainer = marwil.MARWILTrainer(config=config, env="CartPole-v0") learnt = False for i in range(num_iterations): diff --git a/rllib/agents/sac/sac_tf_model.py b/rllib/agents/sac/sac_tf_model.py index b457f1e947e0..e2c56b5215d2 100644 --- a/rllib/agents/sac/sac_tf_model.py +++ b/rllib/agents/sac/sac_tf_model.py @@ -231,8 +231,6 @@ def _get_q_value(self, model_out, actions, net): if isinstance(net.obs_space, Box): if isinstance(model_out, (list, tuple)): model_out = tf.concat(model_out, axis=-1) - elif isinstance(model_out, dict): - model_out = tf.concat(list(model_out.values()), axis=-1) elif isinstance(model_out, dict): model_out = list(model_out.values()) diff --git a/rllib/agents/sac/sac_torch_model.py b/rllib/agents/sac/sac_torch_model.py index 1288d20da362..f3fe34e23324 100644 --- a/rllib/agents/sac/sac_torch_model.py +++ b/rllib/agents/sac/sac_torch_model.py @@ -237,8 +237,6 @@ def _get_q_value(self, model_out, actions, net): if isinstance(net.obs_space, Box): if isinstance(model_out, (list, tuple)): model_out = torch.cat(model_out, dim=-1) - elif isinstance(model_out, dict): - model_out = torch.cat(list(model_out.values()), dim=-1) elif isinstance(model_out, dict): model_out = list(model_out.values()) diff --git a/rllib/env/policy_client.py b/rllib/env/policy_client.py index 39a85a5cf91b..232f74f1a17f 100644 --- a/rllib/env/policy_client.py +++ b/rllib/env/policy_client.py @@ -17,6 +17,7 @@ EnvActionType logger = logging.getLogger(__name__) +logger.setLevel("INFO") # TODO(ekl) seems to be needed for cartpole_client.py try: import requests # `requests` is not part of stdlib. 
diff --git a/rllib/env/policy_server_input.py b/rllib/env/policy_server_input.py index 952130ac5306..45c2a00d292c 100644 --- a/rllib/env/policy_server_input.py +++ b/rllib/env/policy_server_input.py @@ -13,6 +13,7 @@ from ray.rllib.utils.annotations import override, PublicAPI logger = logging.getLogger(__name__) +logger.setLevel("INFO") # TODO(ekl) this is needed for cartpole_server.py class PolicyServerInput(ThreadingMixIn, HTTPServer, InputReader): diff --git a/rllib/evaluation/metrics.py b/rllib/evaluation/metrics.py index e44b301f42d3..6ed723b156d2 100644 --- a/rllib/evaluation/metrics.py +++ b/rllib/evaluation/metrics.py @@ -1,7 +1,7 @@ import logging import numpy as np import collections -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import ray from ray.rllib.evaluation.rollout_metrics import RolloutMetrics @@ -14,19 +14,6 @@ logger = logging.getLogger(__name__) -def extract_stats(stats: Dict, key: str) -> Dict[str, Any]: - if key in stats: - return stats[key] - - multiagent_stats = {} - for k, v in stats.items(): - if isinstance(v, dict): - if key in v: - multiagent_stats[k] = v[key] - - return multiagent_stats - - @DeveloperAPI def get_learner_stats(grad_info: GradInfoDict) -> LearnerStatsDict: """Return optimization stats reported from the policy. 
diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 1c56ef2b9e65..1601e07f3666 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -25,7 +25,7 @@ class MyCallbacks(DefaultCallbacks): @override(DefaultCallbacks) - def on_learn_on_batch(self, *, policy, train_batch, result, **kwargs): + def on_learn_on_batch(self, *, policy, train_batch, **kwargs): assert train_batch.count == 201 assert sum(train_batch.seq_lens) == 201 for k, v in train_batch.data.items(): diff --git a/rllib/examples/custom_metrics_and_callbacks.py b/rllib/examples/custom_metrics_and_callbacks.py index ecbe99bd7baa..745a94029a2e 100644 --- a/rllib/examples/custom_metrics_and_callbacks.py +++ b/rllib/examples/custom_metrics_and_callbacks.py @@ -59,12 +59,6 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): # you can mutate the result dict to add new fields to return result["callback_ok"] = True - def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, - result: dict, **kwargs) -> None: - result["sum_actions_in_train_batch"] = np.sum(train_batch["actions"]) - print("policy.learn_on_batch() result: {} -> sum actions: {}".format( - policy, result["sum_actions_in_train_batch"])) - def on_postprocess_trajectory( self, *, worker: RolloutWorker, episode: MultiAgentEpisode, agent_id: str, policy_id: str, policies: Dict[str, Policy], @@ -94,7 +88,7 @@ def on_postprocess_trajectory( "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), }).trials - # Verify episode-related custom metrics are there. 
+ # verify custom metrics for integration tests custom_metrics = trials[0].last_result["custom_metrics"] print(custom_metrics) assert "pole_angle_mean" in custom_metrics @@ -102,8 +96,3 @@ def on_postprocess_trajectory( assert "pole_angle_max" in custom_metrics assert "num_batches_mean" in custom_metrics assert "callback_ok" in trials[0].last_result - - # Verify `on_learn_on_batch` custom metrics are there (per policy). - info_custom_metrics = custom_metrics["default_policy"] - print(info_custom_metrics) - assert "sum_actions_in_train_batch" in info_custom_metrics diff --git a/rllib/examples/serving/cartpole_client.py b/rllib/examples/serving/cartpole_client.py index f2d45b5b3ea2..3541e0f6f7c6 100755 --- a/rllib/examples/serving/cartpole_client.py +++ b/rllib/examples/serving/cartpole_client.py @@ -17,7 +17,7 @@ parser.add_argument( "--no-train", action="store_true", help="Whether to disable training.") parser.add_argument( - "--inference-mode", type=str, default="local", choices=["local", "remote"]) + "--inference-mode", type=str, required=True, choices=["local", "remote"]) parser.add_argument( "--off-policy", action="store_true", diff --git a/rllib/examples/serving/cartpole_server.py b/rllib/examples/serving/cartpole_server.py index f76a34a91fc1..297320422ca0 100755 --- a/rllib/examples/serving/cartpole_server.py +++ b/rllib/examples/serving/cartpole_server.py @@ -13,7 +13,6 @@ from ray.rllib.agents.dqn import DQNTrainer from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.env.policy_server_input import PolicyServerInput -from ray.rllib.examples.custom_metrics_and_callbacks import MyCallbacks from ray.tune.logger import pretty_print SERVER_ADDRESS = "localhost" @@ -44,7 +43,6 @@ "num_workers": 0, # Disable OPE, since the rollouts are coming from online clients. 
"input_evaluation": [], - "callbacks": MyCallbacks, } if args.run == "DQN": diff --git a/rllib/execution/metric_ops.py b/rllib/execution/metric_ops.py index 06857f674a8e..70ae38e3fbf8 100644 --- a/rllib/execution/metric_ops.py +++ b/rllib/execution/metric_ops.py @@ -88,7 +88,6 @@ def __call__(self, _: Any) -> Dict: # Add in iterator metrics. metrics = _get_shared_metrics() - custom_metrics_from_info = metrics.info.pop("custom_metrics", {}) timers = {} counters = {} info = {} @@ -107,8 +106,6 @@ def __call__(self, _: Any) -> Dict: res["timers"] = timers res["info"] = info res["info"].update(counters) - res["custom_metrics"] = res.get("custom_metrics", {}) - res["custom_metrics"].update(custom_metrics_from_info) return res diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index fe8e7b95b6f5..e2411ed3279a 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -5,8 +5,7 @@ from typing import List, Tuple, Any import ray -from ray.rllib.evaluation.metrics import extract_stats, get_learner_stats, \ - LEARNER_STATS_KEY +from ray.rllib.evaluation.metrics import get_learner_stats, LEARNER_STATS_KEY from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.execution.common import \ STEPS_SAMPLED_COUNTER, STEPS_TRAINED_COUNTER, LEARNER_INFO, \ @@ -59,25 +58,18 @@ def __call__(self, learn_timer = metrics.timers[LEARN_ON_BATCH_TIMER] with learn_timer: if self.num_sgd_iter > 1 or self.sgd_minibatch_size > 0: - lw = self.workers.local_worker() + w = self.workers.local_worker() info = do_minibatch_sgd( - batch, {pid: lw.get_policy(pid) - for pid in self.policies}, lw, self.num_sgd_iter, + batch, {p: w.get_policy(p) + for p in self.policies}, w, self.num_sgd_iter, self.sgd_minibatch_size, []) # TODO(ekl) shouldn't be returning learner stats directly here - # TODO(sven): Skips `custom_metrics` key from on_learn_on_batch - # callback (shouldn't). 
metrics.info[LEARNER_INFO] = info else: info = self.workers.local_worker().learn_on_batch(batch) - metrics.info[LEARNER_INFO] = extract_stats( - info, LEARNER_STATS_KEY) - metrics.info["custom_metrics"] = extract_stats( - info, "custom_metrics") + metrics.info[LEARNER_INFO] = get_learner_stats(info) learn_timer.push_units_processed(batch.count) metrics.counters[STEPS_TRAINED_COUNTER] += batch.count - # Update weights - after learning on the local worker - on all remote - # workers. if self.workers.remote_workers(): with metrics.timers[WORKER_UPDATE_TIMER]: weights = ray.put(self.workers.local_worker().get_weights( diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index 050e655ca6ff..1e1f42c05df2 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -320,11 +320,8 @@ def postprocess_trajectory(self, @override(Policy) def learn_on_batch(self, postprocessed_batch): # Callback handling. - learn_stats = {} self.callbacks.on_learn_on_batch( - policy=self, - train_batch=postprocessed_batch, - result=learn_stats) + policy=self, train_batch=postprocessed_batch) pad_batch_to_sequences_of_same_size( postprocessed_batch, @@ -336,9 +333,7 @@ def learn_on_batch(self, postprocessed_batch): self._is_training = True postprocessed_batch["is_training"] = True - stats = self._learn_on_batch_eager(postprocessed_batch) - stats.update({"custom_metrics": learn_stats}) - return stats + return self._learn_on_batch_eager(postprocessed_batch) @convert_eager_inputs @convert_eager_outputs diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index f16f3f72adfd..3ac64441575d 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -423,18 +423,9 @@ def compute_log_likelihoods( def learn_on_batch( self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]: assert self.loss_initialized() - builder = TFRunBuilder(self._sess, "learn_on_batch") - - # Callback handling. 
- learn_stats = {} - self.callbacks.on_learn_on_batch( - policy=self, train_batch=postprocessed_batch, result=learn_stats) - fetches = self._build_learn_on_batch(builder, postprocessed_batch) - stats = builder.get(fetches) - stats.update({"custom_metrics": learn_stats}) - return stats + return builder.get(fetches) @override(Policy) @DeveloperAPI @@ -850,6 +841,10 @@ def _build_apply_gradients(self, builder, gradients): def _build_learn_on_batch(self, builder, postprocessed_batch): self._debug_vars() + # Callback handling. + self.callbacks.on_learn_on_batch( + policy=self, train_batch=postprocessed_batch) + builder.add_feed_dict(self.extra_compute_grad_feed_dict()) builder.add_feed_dict( self._get_loss_inputs_dict(postprocessed_batch, shuffle=False)) diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 7ff26dfda601..e492a5048563 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -347,9 +347,8 @@ def learn_on_batch( if self.model: self.model.train() # Callback handling. - learn_stats = {} self.callbacks.on_learn_on_batch( - policy=self, train_batch=postprocessed_batch, result=learn_stats) + policy=self, train_batch=postprocessed_batch) # Compute gradients (will calculate all losses and `backward()` # them to get the grads). 
@@ -361,7 +360,6 @@ def learn_on_batch( if self.model: fetches["model"] = self.model.metrics() - fetches.update({"custom_metrics": learn_stats}) return fetches diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index 0f4063bb2e88..933c2d60814e 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -66,7 +66,7 @@ def test_ppo_multiagent(self): class TestSupportedMultiAgentOffPolicy(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - ray.init(num_cpus=6) + ray.init(num_cpus=4) @classmethod def tearDownClass(cls) -> None: @@ -82,9 +82,6 @@ def test_apex_multiagent(self): "min_iter_time_s": 1, "learning_starts": 10, "target_network_update_freq": 100, - "optimizer": { - "num_replay_buffer_shards": 1, - }, }) def test_apex_ddpg_multiagent(self): diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 9da6249273c9..05b90cba52d2 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -47,8 +47,6 @@ def check_support(alg, config, train=True, check_bounds=False, tfe=False): config["log_level"] = "ERROR" - config["train_batch_size"] = 10 - config["rollout_fragment_length"] = 10 def _do_check(alg, config, a_name, o_name): fw = config["framework"] @@ -90,24 +88,25 @@ def _do_check(alg, config, a_name, o_name): frameworks = ("tf", "torch") if tfe: - frameworks += ("tf2", "tfe") + frameworks += ("tfe", ) for _ in framework_iterator(config, frameworks=frameworks): - # Zip through action- and obs-spaces. - for a_name, o_name in zip(ACTION_SPACES_TO_TEST.keys(), - OBSERVATION_SPACES_TO_TEST.keys()): - _do_check(alg, config, a_name, o_name) - # Do the remaining obs spaces. 
- assert len(OBSERVATION_SPACES_TO_TEST) >= len(ACTION_SPACES_TO_TEST) - for i, o_name in enumerate(OBSERVATION_SPACES_TO_TEST.keys()): - if i < len(ACTION_SPACES_TO_TEST): + # Check all action spaces (using a discrete obs-space). + for a_name in ACTION_SPACES_TO_TEST.keys(): + _do_check(alg, config, a_name, "discrete") + # Check all obs spaces (using a supported action-space). + for o_name in OBSERVATION_SPACES_TO_TEST.keys(): + # We already tested discrete observation spaces against all action + # spaces above -> skip. + if o_name == "discrete": continue - _do_check(alg, config, "discrete", o_name) + a_name = "discrete" if alg not in ["DDPG", "SAC"] else "vector" + _do_check(alg, config, a_name, o_name) class TestSupportedSpacesPG(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - ray.init(num_cpus=6) + ray.init(num_cpus=4) @classmethod def tearDownClass(cls) -> None: @@ -126,11 +125,11 @@ def test_impala(self): def test_ppo(self): config = { - "num_workers": 0, - "train_batch_size": 100, - "rollout_fragment_length": 10, + "num_workers": 1, "num_sgd_iter": 1, - "sgd_minibatch_size": 10, + "train_batch_size": 10, + "rollout_fragment_length": 10, + "sgd_minibatch_size": 1, } check_support("PPO", config, check_bounds=True, tfe=True) diff --git a/rllib/utils/sgd.py b/rllib/utils/sgd.py index 787b885cd7d6..b5b72d44d37c 100644 --- a/rllib/utils/sgd.py +++ b/rllib/utils/sgd.py @@ -104,12 +104,12 @@ def do_minibatch_sgd(samples, policies, local_worker, num_sgd_iter, """Execute minibatch SGD. Args: - samples (SampleBatch): Batch of samples to optimize. - policies (dict): Dictionary of policies to optimize. - local_worker (RolloutWorker): Master rollout worker instance. - num_sgd_iter (int): Number of epochs of optimization to take. - sgd_minibatch_size (int): Size of minibatches to use for optimization. - standardize_fields (list): List of sample field names that should be + samples (SampleBatch): batch of samples to optimize. 
+ policies (dict): dictionary of policies to optimize. + local_worker (RolloutWorker): master rollout worker instance. + num_sgd_iter (int): number of epochs of optimization to take. + sgd_minibatch_size (int): size of minibatches to use for optimization. + standardize_fields (list): list of sample field names that should be normalized prior to optimization. Returns: From 7fa1cfc7edb26c32c7c7c5c31f3a3c09b7b6701e Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 064/244] Revert "[RLlib] Pytorch MAML fix for more than two workers with discrete actions (#13835)" This reverts commit 2c7d2af5f1923ef8c60acd0bb1586fce41a7053b. --- rllib/agents/maml/maml_torch_policy.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/rllib/agents/maml/maml_torch_policy.py b/rllib/agents/maml/maml_torch_policy.py index 695826798272..2e0e1e2083b7 100644 --- a/rllib/agents/maml/maml_torch_policy.py +++ b/rllib/agents/maml/maml_torch_policy.py @@ -8,8 +8,8 @@ from ray.rllib.agents.ppo.ppo_tf_policy import setup_config from ray.rllib.agents.ppo.ppo_torch_policy import vf_preds_fetches, \ ValueNetworkMixin -from ray.rllib.utils.torch_ops import apply_grad_clipping from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_ops import apply_grad_clipping torch, nn = try_import_torch() @@ -178,7 +178,7 @@ def __init__(self, # Meta Update ppo_loss, s_loss, kl_loss, v_loss, ent = self.compute_losses( - fnet, self.inner_adaptation_steps - 1, i, clip_loss=True) + fnet, self.inner_adaptation_steps, i, clip_loss=True) inner_loss = torch.mean( torch.stack([ @@ -271,14 +271,8 @@ def maml_loss(policy, model, dist_class, train_batch): # `split` may not exist yet (during test-loss call), use a dummy value. # Cannot use get here due to train_batch being a TrackingDict. 
- if "split" in train_batch: - split = train_batch["split"] - else: - split_shape = (policy.config["inner_adaptation_steps"], - policy.config["num_workers"]) - split_const = int(train_batch["obs"].shape[0] // - (split_shape[0] * split_shape[1])) - split = torch.ones(split_shape, dtype=int) * split_const + split = train_batch["split"] if "split" in train_batch else \ + torch.tensor([[8, 8], [8, 8]]) policy.loss_obj = MAMLLoss( model=model, dist_class=dist_class, From 531073c8436d6d866879c088dfa7541e004d1a42 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 065/244] Revert "[RLlib] Allow `rllib rollout` to run distributed via evaluation workers. (#13718)" This reverts commit 14967d2e93affe51b7a2f1034d47d0b1ab30f95d. --- rllib/BUILD | 8 +- rllib/__init__.py | 4 +- rllib/agents/mock.py | 8 +- rllib/agents/registry.py | 88 +++--- rllib/agents/trainer.py | 22 +- rllib/contrib/registry.py | 17 +- rllib/env/base_env.py | 42 +-- rllib/env/vector_env.py | 85 ++---- rllib/evaluation/rollout_worker.py | 11 +- rllib/evaluation/sampler.py | 35 +-- rllib/examples/export/cartpole_dqn_export.py | 4 +- rllib/examples/pettingzoo_env.py | 6 +- .../rock_paper_scissors_multiagent.py | 4 +- rllib/execution/learner_thread.py | 10 +- rllib/rollout.py | 285 ++++++++---------- rllib/tests/test_checkpoint_restore.py | 4 +- rllib/tests/test_eager_support.py | 4 +- rllib/tests/test_export.py | 4 +- rllib/tests/test_ignore_worker_failure.py | 6 +- rllib/tests/test_model_imports.py | 4 +- rllib/tests/test_pettingzoo_env.py | 4 +- rllib/tests/test_supported_multi_agent.py | 7 +- rllib/tests/test_supported_spaces.py | 4 +- rllib/train.py | 3 +- 24 files changed, 258 insertions(+), 411 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 05c09d85d8b9..cfe22c60fbfd 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -517,7 +517,7 @@ py_test( py_test( name = "test_marwil", tags = ["agents_dir"], - size = "large", + 
size = "medium", # Include the json data file. data = ["tests/data/cartpole/large.json"], srcs = ["agents/marwil/tests/test_marwil.py"] @@ -527,7 +527,7 @@ py_test( py_test( name = "test_bc", tags = ["agents_dir"], - size = "large", + size = "medium", # Include the json data file. data = ["tests/data/cartpole/large.json"], srcs = ["agents/marwil/tests/test_bc.py"] @@ -1753,7 +1753,7 @@ py_test( name = "examples/custom_eval_tf", main = "examples/custom_eval.py", tags = ["examples", "examples_C"], - size = "medium", + size = "small", srcs = ["examples/custom_eval.py"], args = ["--num-cpus=4", "--as-test"] ) @@ -1762,7 +1762,7 @@ py_test( name = "examples/custom_eval_torch", main = "examples/custom_eval.py", tags = ["examples", "examples_C"], - size = "medium", + size = "small", srcs = ["examples/custom_eval.py"], args = ["--num-cpus=4", "--as-test", "--torch"] ) diff --git a/rllib/__init__.py b/rllib/__init__.py index 4af44a28786f..d27194f692b3 100644 --- a/rllib/__init__.py +++ b/rllib/__init__.py @@ -27,12 +27,12 @@ def _setup_logger(): def _register_all(): from ray.rllib.agents.trainer import Trainer, with_common_config - from ray.rllib.agents.registry import ALGORITHMS, get_trainer_class + from ray.rllib.agents.registry import ALGORITHMS, get_agent_class from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS for key in list(ALGORITHMS.keys()) + list(CONTRIBUTED_ALGORITHMS.keys( )) + ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]: - register_trainable(key, get_trainer_class(key)) + register_trainable(key, get_agent_class(key)) def _see_contrib(name): """Returns dummy agent class warning algo is in contrib/.""" diff --git a/rllib/agents/mock.py b/rllib/agents/mock.py index 1a9017252567..90bfffe83bd8 100644 --- a/rllib/agents/mock.py +++ b/rllib/agents/mock.py @@ -118,14 +118,14 @@ def step(self): info={}) -def _trainer_import_failed(trace): +def _agent_import_failed(trace): """Returns dummy agent class for if PyTorch etc. 
is not installed.""" - class _TrainerImportFailed(Trainer): - _name = "TrainerImportFailed" + class _AgentImportFailed(Trainer): + _name = "AgentImportFailed" _default_config = with_common_config({}) def setup(self, config): raise ImportError(trace) - return _TrainerImportFailed + return _AgentImportFailed diff --git a/rllib/agents/registry.py b/rllib/agents/registry.py index efed5a21742f..8ec4a4582ede 100644 --- a/rllib/agents/registry.py +++ b/rllib/agents/registry.py @@ -3,127 +3,126 @@ import traceback from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS -from ray.rllib.utils.deprecation import deprecation_warning def _import_a2c(): from ray.rllib.agents import a3c - return a3c.A2CTrainer, a3c.a2c.A2C_DEFAULT_CONFIG + return a3c.A2CTrainer def _import_a3c(): from ray.rllib.agents import a3c - return a3c.A3CTrainer, a3c.DEFAULT_CONFIG + return a3c.A3CTrainer def _import_apex(): from ray.rllib.agents import dqn - return dqn.ApexTrainer, dqn.apex.APEX_DEFAULT_CONFIG + return dqn.ApexTrainer def _import_apex_ddpg(): from ray.rllib.agents import ddpg - return ddpg.ApexDDPGTrainer, ddpg.apex.APEX_DDPG_DEFAULT_CONFIG + return ddpg.ApexDDPGTrainer def _import_appo(): from ray.rllib.agents import ppo - return ppo.APPOTrainer, ppo.appo.DEFAULT_CONFIG + return ppo.APPOTrainer def _import_ars(): from ray.rllib.agents import ars - return ars.ARSTrainer, ars.DEFAULT_CONFIG + return ars.ARSTrainer def _import_bc(): from ray.rllib.agents import marwil - return marwil.BCTrainer, marwil.DEFAULT_CONFIG + return marwil.BCTrainer def _import_cql(): from ray.rllib.agents import cql - return cql.CQLTrainer, cql.CQL_DEFAULT_CONFIG + return cql.CQLTrainer def _import_ddpg(): from ray.rllib.agents import ddpg - return ddpg.DDPGTrainer, ddpg.DEFAULT_CONFIG + return ddpg.DDPGTrainer def _import_ddppo(): from ray.rllib.agents import ppo - return ppo.DDPPOTrainer, ppo.DEFAULT_CONFIG + return ppo.DDPPOTrainer def _import_dqn(): from ray.rllib.agents import dqn - return 
dqn.DQNTrainer, dqn.DEFAULT_CONFIG + return dqn.DQNTrainer def _import_dreamer(): from ray.rllib.agents import dreamer - return dreamer.DREAMERTrainer, dreamer.DEFAULT_CONFIG + return dreamer.DREAMERTrainer def _import_es(): from ray.rllib.agents import es - return es.ESTrainer, es.DEFAULT_CONFIG + return es.ESTrainer def _import_impala(): from ray.rllib.agents import impala - return impala.ImpalaTrainer, impala.DEFAULT_CONFIG + return impala.ImpalaTrainer def _import_maml(): from ray.rllib.agents import maml - return maml.MAMLTrainer, maml.DEFAULT_CONFIG + return maml.MAMLTrainer def _import_marwil(): from ray.rllib.agents import marwil - return marwil.MARWILTrainer, marwil.DEFAULT_CONFIG + return marwil.MARWILTrainer def _import_mbmpo(): from ray.rllib.agents import mbmpo - return mbmpo.MBMPOTrainer, mbmpo.DEFAULT_CONFIG + return mbmpo.MBMPOTrainer def _import_pg(): from ray.rllib.agents import pg - return pg.PGTrainer, pg.DEFAULT_CONFIG + return pg.PGTrainer def _import_ppo(): from ray.rllib.agents import ppo - return ppo.PPOTrainer, ppo.DEFAULT_CONFIG + return ppo.PPOTrainer def _import_qmix(): from ray.rllib.agents import qmix - return qmix.QMixTrainer, qmix.DEFAULT_CONFIG + return qmix.QMixTrainer def _import_sac(): from ray.rllib.agents import sac - return sac.SACTrainer, sac.DEFAULT_CONFIG + return sac.SACTrainer def _import_simple_q(): from ray.rllib.agents import dqn - return dqn.SimpleQTrainer, dqn.simple_q.DEFAULT_CONFIG + return dqn.SimpleQTrainer def _import_slate_q(): from ray.rllib.agents import slateq - return slateq.SlateQTrainer, slateq.DEFAULT_CONFIG + return slateq.SlateQTrainer def _import_td3(): from ray.rllib.agents import ddpg - return ddpg.TD3Trainer, ddpg.td3.TD3_DEFAULT_CONFIG + return ddpg.TD3Trainer ALGORITHMS = { @@ -154,47 +153,32 @@ def _import_td3(): } -def get_trainer_class(alg: str, return_config=False) -> type: - """Returns the class of a known Trainer given its name.""" +def get_agent_class(alg: str) -> type: + """Returns the 
class of a known agent given its name.""" try: - return _get_trainer_class(alg, return_config=return_config) + return _get_agent_class(alg) except ImportError: - from ray.rllib.agents.mock import _trainer_import_failed - class_ = _trainer_import_failed(traceback.format_exc()) - config = class_._default_config - if return_config: - return class_, config - return class_ - + from ray.rllib.agents.mock import _agent_import_failed + return _agent_import_failed(traceback.format_exc()) -# Deprecated: Use `get_trainer_class` instead. -def get_agent_class(alg: str) -> type: - deprecation_warning("get_agent_class", "get_trainer_class", error=False) - return get_trainer_class(alg) - -def _get_trainer_class(alg: str, return_config=False) -> type: +def _get_agent_class(alg: str) -> type: if alg in ALGORITHMS: - class_, config = ALGORITHMS[alg]() + return ALGORITHMS[alg]() elif alg in CONTRIBUTED_ALGORITHMS: - class_, config = CONTRIBUTED_ALGORITHMS[alg]() + return CONTRIBUTED_ALGORITHMS[alg]() elif alg == "script": from ray.tune import script_runner - class_, config = script_runner.ScriptRunner, {} + return script_runner.ScriptRunner elif alg == "__fake": from ray.rllib.agents.mock import _MockTrainer - class_, config = _MockTrainer, _MockTrainer._default_config + return _MockTrainer elif alg == "__sigmoid_fake_data": from ray.rllib.agents.mock import _SigmoidFakeData - class_, config = _SigmoidFakeData, _SigmoidFakeData._default_config + return _SigmoidFakeData elif alg == "__parameter_tuning": from ray.rllib.agents.mock import _ParameterTuningTrainer - class_, config = _ParameterTuningTrainer, \ - _ParameterTuningTrainer._default_config + return _ParameterTuningTrainer else: raise Exception(("Unknown algorithm {}.").format(alg)) - - if return_config: - return class_, config - return class_ diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index b2c57d0b1311..65e315a1d1e8 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -52,7 +52,7 @@ # Number 
of rollout worker actors to create for parallel sampling. Setting # this to 0 will force rollouts to be done in the trainer actor. "num_workers": 2, - # Number of environments to evaluate vector-wise per worker. This enables + # Number of environments to evaluate vectorwise per worker. This enables # model inference batching, which can improve performance for inference # bottlenecked workloads. "num_envs_per_worker": 1, @@ -120,18 +120,10 @@ # set this if soft_horizon=True, unless your env is actually running # forever without returning done=True. "no_done_at_end": False, - # Environment name can also be passed via config. - "env": None, # Arguments to pass to the env creator. "env_config": {}, - # If True, try to render the environment on the local worker or on worker - # 1 (if num_workers > 0). For vectorized envs, this usually means that only - # the first sub-environment will be rendered. - "render_env": False, - # If True, store evaluation videos in the output dir. - # Alternatively, provide a path (str) to a directory here, where the env - # recordings should be stored instead. - "record_env": False, + # Environment name can also be passed via config. + "env": None, # Unsquash actions to the upper and lower bounds of env's action space "normalize_actions": False, # Whether to clip rewards during Policy's postprocessing. @@ -221,10 +213,9 @@ }, # Number of parallel workers to use for evaluation. Note that this is set # to zero by default, which means evaluation will be run in the trainer - # process (only if evaluation_interval is not None). If you increase this, - # it will increase the Ray resource usage of the trainer since evaluation - # workers are created separately from rollout workers (used to sample data - # for training). + # process. If you increase this, it will increase the Ray resource usage + # of the trainer since evaluation workers are created separately from + # rollout workers. "evaluation_num_workers": 0, # Customize the evaluation method. 
This must be a function of signature # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the @@ -671,6 +662,7 @@ def get_scope(): extra_config["in_evaluation"] is True extra_config.update({ "batch_mode": "complete_episodes", + "rollout_fragment_length": 1, "in_evaluation": True, }) logger.debug( diff --git a/rllib/contrib/registry.py b/rllib/contrib/registry.py index 301516602c24..aed8712bbc0c 100644 --- a/rllib/contrib/registry.py +++ b/rllib/contrib/registry.py @@ -3,29 +3,28 @@ def _import_random_agent(): from ray.rllib.contrib.random_agent.random_agent import RandomAgent - return RandomAgent, RandomAgent._default_config + return RandomAgent def _import_maddpg(): from ray.rllib.contrib import maddpg - return maddpg.MADDPGTrainer, maddpg.DEFAULT_CONFIG + return maddpg.MADDPGTrainer def _import_alphazero(): from ray.rllib.contrib.alpha_zero.core.alpha_zero_trainer import\ - AlphaZeroTrainer, DEFAULT_CONFIG - return AlphaZeroTrainer, DEFAULT_CONFIG + AlphaZeroTrainer + return AlphaZeroTrainer def _import_bandit_lints(): - from ray.rllib.contrib.bandits.agents.lin_ts import LinTSTrainer, TS_CONFIG - return LinTSTrainer, TS_CONFIG + from ray.rllib.contrib.bandits.agents.lin_ts import LinTSTrainer + return LinTSTrainer def _import_bandit_linucb(): - from ray.rllib.contrib.bandits.agents.lin_ucb import LinUCBTrainer, \ - UCB_CONFIG - return LinUCBTrainer, UCB_CONFIG + from ray.rllib.contrib.bandits.agents.lin_ucb import LinUCBTrainer + return LinUCBTrainer CONTRIBUTED_ALGORITHMS = { diff --git a/rllib/env/base_env.py b/rllib/env/base_env.py index 081fae6fe13c..9ff16ac5ac6c 100644 --- a/rllib/env/base_env.py +++ b/rllib/env/base_env.py @@ -5,8 +5,8 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.vector_env import VectorEnv from ray.rllib.utils.annotations import override, PublicAPI -from ray.rllib.utils.typing import AgentID, EnvID, EnvType, MultiAgentDict, \ - MultiEnvDict, PartialTrainerConfigDict +from 
ray.rllib.utils.typing import EnvType, MultiEnvDict, EnvID, \ + AgentID, MultiAgentDict if TYPE_CHECKING: from ray.rllib.models.preprocessors import Preprocessor @@ -80,14 +80,11 @@ class BaseEnv: """ @staticmethod - def to_base_env( - env: EnvType, - make_env: Callable[[int], EnvType] = None, - num_envs: int = 1, - remote_envs: bool = False, - remote_env_batch_wait_ms: int = 0, - policy_config: PartialTrainerConfigDict = None, - ) -> "BaseEnv": + def to_base_env(env: EnvType, + make_env: Callable[[int], EnvType] = None, + num_envs: int = 1, + remote_envs: bool = False, + remote_env_batch_wait_ms: int = 0) -> "BaseEnv": """Wraps any env type as needed to expose the async interface.""" from ray.rllib.env.remote_vector_env import RemoteVectorEnv @@ -132,9 +129,7 @@ def to_base_env( existing_envs=[env], num_envs=num_envs, action_space=env.action_space, - observation_space=env.observation_space, - policy_config=policy_config, - ) + observation_space=env.observation_space) env = _VectorEnvToBaseEnv(env) assert isinstance(env, BaseEnv), env return env @@ -210,18 +205,6 @@ def stop(self) -> None: if hasattr(env, "close"): env.close() - # Experimental method. - def try_render(self, env_id: Optional[EnvID] = None) -> None: - """Tries to render the environment. - - Args: - env_id (Optional[int]): The sub-env ID if applicable. If None, - renders the entire Env (i.e. all sub-envs). - """ - - # By default, do nothing. 
- pass - # Fixed agent identifier when there is only the single agent in the env _DUMMY_AGENT_ID = "agent0" @@ -363,19 +346,14 @@ def send_actions(self, action_dict: MultiEnvDict) -> None: self.vector_env.vector_step(action_vector) @override(BaseEnv) - def try_reset(self, env_id: Optional[EnvID] = None) -> MultiAgentDict: - assert env_id is None or isinstance(env_id, int) + def try_reset(self, + env_id: Optional[EnvID] = None) -> Optional[MultiAgentDict]: return {_DUMMY_AGENT_ID: self.vector_env.reset_at(env_id)} @override(BaseEnv) def get_unwrapped(self) -> List[EnvType]: return self.vector_env.get_unwrapped() - @override(BaseEnv) - def try_render(self, env_id: Optional[EnvID] = None) -> None: - assert env_id is None or isinstance(env_id, int) - return self.vector_env.try_render_at(env_id) - class _MultiAgentEnvToBaseEnv(BaseEnv): """Internal adapter of MultiAgentEnv to BaseEnv. diff --git a/rllib/env/vector_env.py b/rllib/env/vector_env.py index f07098d0a352..49d4bdf6d855 100644 --- a/rllib/env/vector_env.py +++ b/rllib/env/vector_env.py @@ -1,12 +1,11 @@ import logging import gym -from gym import wrappers as gym_wrappers import numpy as np -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Tuple from ray.rllib.utils.annotations import override, PublicAPI -from ray.rllib.utils.typing import EnvActionType, EnvConfigDict, EnvInfoDict, \ - EnvObsType, EnvType, PartialTrainerConfigDict +from ray.rllib.utils.typing import EnvType, EnvConfigDict, EnvObsType, \ + EnvInfoDict, EnvActionType logger = logging.getLogger(__name__) @@ -31,22 +30,19 @@ def __init__(self, observation_space: gym.Space, action_space: gym.Space, self.num_envs = num_envs @staticmethod - def wrap(make_env: Optional[Callable[[int], EnvType]] = None, - existing_envs: Optional[List[gym.Env]] = None, + def wrap(make_env: Callable[[int], EnvType] = None, + existing_envs: List[gym.Env] = None, num_envs: int = 1, - action_space: Optional[gym.Space] = None, - 
observation_space: Optional[gym.Space] = None, - env_config: Optional[EnvConfigDict] = None, - policy_config: Optional[PartialTrainerConfigDict] = None): + action_space: gym.Space = None, + observation_space: gym.Space = None, + env_config: EnvConfigDict = None): return _VectorizedGymEnv( make_env=make_env, existing_envs=existing_envs or [], num_envs=num_envs, observation_space=observation_space, action_space=action_space, - env_config=env_config, - policy_config=policy_config, - ) + env_config=env_config) @PublicAPI def vector_reset(self) -> List[EnvObsType]: @@ -58,12 +54,9 @@ def vector_reset(self) -> List[EnvObsType]: raise NotImplementedError @PublicAPI - def reset_at(self, index: Optional[int] = None) -> EnvObsType: + def reset_at(self, index: int) -> EnvObsType: """Resets a single environment. - Args: - index (Optional[int]): An optional sub-env index to reset. - Returns: obs (obj): Observations from the reset sub environment. """ @@ -95,31 +88,19 @@ def get_unwrapped(self) -> List[EnvType]: """ raise NotImplementedError - # Experimental method. - def try_render_at(self, index: Optional[int] = None) -> None: - """Renders a single environment. - - Args: - index (Optional[int]): An optional sub-env index to render. - """ - pass - class _VectorizedGymEnv(VectorEnv): """Internal wrapper to translate any gym envs into a VectorEnv object. """ - def __init__( - self, - make_env=None, - existing_envs=None, - num_envs=1, - *, - observation_space=None, - action_space=None, - env_config=None, - policy_config=None, - ): + def __init__(self, + make_env=None, + existing_envs=None, + num_envs=1, + *, + observation_space=None, + action_space=None, + env_config=None): """Initializes a _VectorizedGymEnv object. Args: @@ -135,27 +116,11 @@ def __init__( If None, use existing_envs[0]'s action space. env_config (Optional[dict]): Additional sub env config to pass to make_env as first arg. 
- policy_config (Optional[PartialTrainerConfigDict]): An optional - trainer/policy config dict. """ + self.make_env = make_env self.envs = existing_envs - - # Fill up missing envs (so we have exactly num_envs sub-envs in this - # VectorEnv. while len(self.envs) < num_envs: - self.envs.append(make_env(len(self.envs))) - - # Wrap all envs with video recorder if necessary. - if policy_config is not None and policy_config.get("record_env"): - - def wrapper_(env): - return gym_wrappers.Monitor( - env=env, - directory=policy_config["record_env"], - video_callable=lambda _: True, - force=True) - - self.envs = [wrapper_(e) for e in self.envs] + self.envs.append(self.make_env(len(self.envs))) super().__init__( observation_space=observation_space @@ -168,9 +133,7 @@ def vector_reset(self): return [e.reset() for e in self.envs] @override(VectorEnv) - def reset_at(self, index: Optional[int] = None) -> EnvObsType: - if index is None: - index = 0 + def reset_at(self, index): return self.envs[index].reset() @override(VectorEnv) @@ -194,9 +157,3 @@ def vector_step(self, actions): @override(VectorEnv) def get_unwrapped(self): return self.envs - - @override(VectorEnv) - def try_render_at(self, index: Optional[int] = None): - if index is None: - index = 0 - return self.envs[index].render() diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index e824a01747d7..39d4bef776db 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -546,9 +546,7 @@ def make_env(vector_index): make_env=make_env, num_envs=num_envs, remote_envs=remote_worker_envs, - remote_env_batch_wait_ms=remote_env_batch_wait_ms, - policy_config=policy_config, - ) + remote_env_batch_wait_ms=remote_env_batch_wait_ms) # `truncate_episodes`: Allow a batch to contain more than one episode # (fragments) and always make the batch `rollout_fragment_length` @@ -585,11 +583,6 @@ def make_env(vector_index): raise ValueError( "Unknown evaluation method: 
{}".format(method)) - render = False - if policy_config.get("render_env") is True and \ - (num_workers == 0 or worker_index == 1): - render = True - if self.env is None: self.sampler = None elif sample_async: @@ -615,7 +608,6 @@ def make_env(vector_index): _use_trajectory_view_api=_use_trajectory_view_api, sample_collector_class=policy_config.get( "sample_collector_class"), - render=render, ) # Start the Sampler thread. self.sampler.start() @@ -641,7 +633,6 @@ def make_env(vector_index): _use_trajectory_view_api=_use_trajectory_view_api, sample_collector_class=policy_config.get( "sample_collector_class"), - render=render, ) self.input_reader: InputReader = input_creator(self.io_context) diff --git a/rllib/evaluation/sampler.py b/rllib/evaluation/sampler.py index 1eea70fc3cdf..eb81b65de9c9 100644 --- a/rllib/evaluation/sampler.py +++ b/rllib/evaluation/sampler.py @@ -65,16 +65,17 @@ class _PerfStats: def __init__(self): self.iters = 0 + self.env_wait_time = 0.0 self.raw_obs_processing_time = 0.0 self.inference_time = 0.0 self.action_processing_time = 0.0 - self.env_wait_time = 0.0 - self.env_render_time = 0.0 def get(self): # Mean multiplicator (1000 = ms -> sec). factor = 1000 / self.iters return { + # Waiting for environment (during poll). + "mean_env_wait_ms": self.env_wait_time * factor, # Raw observation preprocessing. "mean_raw_obs_processing_ms": self.raw_obs_processing_time * factor, @@ -82,10 +83,6 @@ def get(self): "mean_inference_ms": self.inference_time * factor, # Processing actions (to be sent to env, e.g. clipping). "mean_action_processing_ms": self.action_processing_time * factor, - # Waiting for environment (during poll). - "mean_env_wait_ms": self.env_wait_time * factor, - # Environment rendering (False by default). 
- "mean_env_render_ms": self.env_render_time * factor, } @@ -144,9 +141,7 @@ def __init__( no_done_at_end: bool = False, observation_fn: "ObservationFunction" = None, _use_trajectory_view_api: bool = False, - sample_collector_class: Optional[Type[SampleCollector]] = None, - render: bool = False, - ): + sample_collector_class: Optional[Type[SampleCollector]] = None): """Initializes a SyncSampler object. Args: @@ -189,8 +184,6 @@ def __init__( sample_collector_class (Optional[Type[SampleCollector]]): An optional Samplecollector sub-class to use to collect, store, and retrieve environment-, model-, and sampler data. - render (bool): Whether to try to render the environment after each - step. """ self.base_env = BaseEnv.to_base_env(env) @@ -214,7 +207,6 @@ def __init__( count_steps_by=count_steps_by) else: self.sample_collector = None - self.render = render # Create the rollout generator to use for calls to `get_data()`. self.rollout_provider = _env_runner( @@ -223,7 +215,7 @@ def __init__( self.preprocessors, self.obs_filters, clip_rewards, clip_actions, multiple_episodes_in_batch, callbacks, tf_sess, self.perf_stats, soft_horizon, no_done_at_end, observation_fn, - _use_trajectory_view_api, self.sample_collector, self.render) + _use_trajectory_view_api, self.sample_collector) self.metrics_queue = queue.Queue() @override(SamplerInput) @@ -288,7 +280,6 @@ def __init__( observation_fn: "ObservationFunction" = None, _use_trajectory_view_api: bool = False, sample_collector_class: Optional[Type[SampleCollector]] = None, - render: bool = False, ): """Initializes a AsyncSampler object. @@ -336,8 +327,6 @@ def __init__( sample_collector_class (Optional[Type[SampleCollector]]): An optional Samplecollector sub-class to use to collect, store, and retrieve environment-, model-, and sampler data. - render (bool): Whether to try to render the environment after each - step. 
""" for _, f in obs_filters.items(): assert getattr(f, "is_concurrent", False), \ @@ -367,7 +356,6 @@ def __init__( self.shutdown = False self.observation_fn = observation_fn self._use_trajectory_view_api = _use_trajectory_view_api - self.render = render if _use_trajectory_view_api: if not sample_collector_class: sample_collector_class = SimpleListCollector @@ -404,7 +392,7 @@ def _run(self): self.clip_actions, self.multiple_episodes_in_batch, self.callbacks, self.tf_sess, self.perf_stats, self.soft_horizon, self.no_done_at_end, self.observation_fn, - self._use_trajectory_view_api, self.sample_collector, self.render) + self._use_trajectory_view_api, self.sample_collector) while not self.shutdown: # The timeout variable exists because apparently, if one worker # dies, the other workers won't die with it, unless the timeout is @@ -470,7 +458,6 @@ def _env_runner( observation_fn: "ObservationFunction", _use_trajectory_view_api: bool = False, sample_collector: Optional[SampleCollector] = None, - render: bool = None, ) -> Iterable[SampleBatchType]: """This implements the common experience collection logic. @@ -510,9 +497,7 @@ def _env_runner( `_use_trajectory_view_api` to make generic trajectory views available to Models. Default: False. sample_collector (Optional[SampleCollector]): An optional - SampleCollector object to use. - render (bool): Whether to try to render the environment after each - step. + SampleCollector object to use Yields: rollout (SampleBatch): Object containing state, action, reward, @@ -701,12 +686,6 @@ def new_episode(env_id): base_env.send_actions(actions_to_send) perf_stats.env_wait_time += time.time() - t4 - # Try to render the env, if required. 
- if render: - t5 = time.time() - base_env.try_render() - perf_stats.env_render_time += time.time() - t5 - def _process_observations( *, diff --git a/rllib/examples/export/cartpole_dqn_export.py b/rllib/examples/export/cartpole_dqn_export.py index 8d0ac7abaf87..8b315dd79a34 100644 --- a/rllib/examples/export/cartpole_dqn_export.py +++ b/rllib/examples/export/cartpole_dqn_export.py @@ -3,7 +3,7 @@ import os import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.framework import try_import_tf tf1, tf, tfv = try_import_tf() @@ -12,7 +12,7 @@ def train_and_export(algo_name, num_steps, model_dir, ckpt_dir, prefix): - cls = get_trainer_class(algo_name) + cls = get_agent_class(algo_name) alg = cls(config={}, env="CartPole-v0") for _ in range(num_steps): alg.train() diff --git a/rllib/examples/pettingzoo_env.py b/rllib/examples/pettingzoo_env.py index da49ccbdc22d..bd9901a17954 100644 --- a/rllib/examples/pettingzoo_env.py +++ b/rllib/examples/pettingzoo_env.py @@ -4,7 +4,7 @@ from supersuit import normalize_obs_v0, dtype_v0, color_reduction_v0 import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.env import PettingZooEnv from pettingzoo.butterfly import pistonball_v1 @@ -33,7 +33,7 @@ def env_creator(config): num_rollouts = 2 # 1. Gets default training configuration and specifies the POMgame to load. - config = deepcopy(get_trainer_class(alg_name)._default_config) + config = deepcopy(get_agent_class(alg_name)._default_config) # 2. Set environment config. This will be passed to # the env_creator function via the register env lambda below. @@ -76,7 +76,7 @@ def env_creator(config): # 6. Initialize ray and trainer object ray.init(num_cpus=num_cpus + 1) - trainer = get_trainer_class(alg_name)(env="pistonball", config=config) + trainer = get_agent_class(alg_name)(env="pistonball", config=config) # 7. 
Train once trainer.train() diff --git a/rllib/examples/rock_paper_scissors_multiagent.py b/rllib/examples/rock_paper_scissors_multiagent.py index 0eb3709c14a0..dde72248e9b8 100644 --- a/rllib/examples/rock_paper_scissors_multiagent.py +++ b/rllib/examples/rock_paper_scissors_multiagent.py @@ -14,7 +14,7 @@ from ray import tune from ray.rllib.agents.pg import PGTrainer, PGTFPolicy, PGTorchPolicy -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors from ray.rllib.examples.policy.rock_paper_scissors_dummies import \ BeatLastHeuristic, AlwaysSameHeuristic @@ -87,7 +87,7 @@ def select_policy(agent_id): }, "framework": "torch" if args.torch else "tf", } - cls = get_trainer_class(trainer) if isinstance(trainer, str) else trainer + cls = get_agent_class(trainer) if isinstance(trainer, str) else trainer trainer_obj = cls(config=config) env = trainer_obj.workers.local_worker().env for _ in range(args.stop_iters): diff --git a/rllib/execution/learner_thread.py b/rllib/execution/learner_thread.py index 4f1f6e84275f..8f5350fa146d 100644 --- a/rllib/execution/learner_thread.py +++ b/rllib/execution/learner_thread.py @@ -1,7 +1,8 @@ +from typing import Dict +import threading import copy + from six.moves import queue -import threading -from typing import Dict from ray.rllib.evaluation.metrics import get_learner_stats from ray.rllib.execution.minibatch_buffer import MinibatchBuffer @@ -68,10 +69,7 @@ def run(self) -> None: def step(self) -> None: with self.queue_timer: - try: - batch, _ = self.minibatch_buffer.get() - except queue.Empty: - return + batch, _ = self.minibatch_buffer.get() with self.grad_timer: fetches = self.local_worker.learn_on_batch(batch) diff --git a/rllib/rollout.py b/rllib/rollout.py index be4bce95a58e..dfc599160865 100755 --- a/rllib/rollout.py +++ b/rllib/rollout.py @@ -12,27 +12,24 @@ import ray import ray.cloudpickle as 
cloudpickle -from ray.rllib.agents.registry import get_trainer_class from ray.rllib.env import MultiAgentEnv from ray.rllib.env.base_env import _DUMMY_AGENT_ID from ray.rllib.env.env_context import EnvContext from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.spaces.space_utils import flatten_to_single_ndarray from ray.tune.utils import merge_dicts from ray.tune.registry import get_trainable_cls, _global_registry, ENV_CREATOR EXAMPLE_USAGE = """ -Example usage via RLlib CLI: +Example Usage via RLlib CLI: rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN --env CartPole-v0 --steps 1000000 --out rollouts.pkl -Example usage via executable: +Example Usage via executable: ./rollout.py /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN --env CartPole-v0 --steps 1000000 --out rollouts.pkl - -Example usage w/o checkpoint (for testing purposes): - ./rollout.py --run PPO --env CartPole-v0 --episodes 500 """ # Note: if you use any custom models or envs, register them here first, e.g.: @@ -45,94 +42,6 @@ # register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10)) -def create_parser(parser_creator=None): - parser_creator = parser_creator or argparse.ArgumentParser - parser = parser_creator( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="Roll out a reinforcement learning agent " - "given a checkpoint.", - epilog=EXAMPLE_USAGE) - - parser.add_argument( - "checkpoint", - type=str, - nargs="?", - help="(Optional) checkpoint from which to roll out. " - "If none given, will use an initial (untrained) Trainer.") - - required_named = parser.add_argument_group("required named arguments") - required_named.add_argument( - "--run", - type=str, - required=True, - help="The algorithm or model to train. This may refer to the name " - "of a built-on algorithm (e.g. 
RLLib's `DQN` or `PPO`), or a " - "user-defined trainable function or class registered in the " - "tune registry.") - required_named.add_argument( - "--env", - type=str, - help="The environment specifier to use. This could be an openAI gym " - "specifier (e.g. `CartPole-v0`) or a full class-path (e.g. " - "`ray.rllib.examples.env.simple_corridor.SimpleCorridor`).") - parser.add_argument( - "--local-mode", - action="store_true", - help="Run ray in local mode for easier debugging.") - parser.add_argument( - "--no-render", - default=False, - action="store_const", - const=True, - help="Suppress rendering of the environment.") - parser.add_argument( - "--video-dir", - type=str, - default=None, - help="Specifies the directory into which videos of all episode " - "rollouts will be stored.") - parser.add_argument( - "--steps", - default=10000, - help="Number of timesteps to roll out. Rollout will also stop if " - "`--episodes` limit is reached first. A value of 0 means no " - "limitation on the number of timesteps run.") - parser.add_argument( - "--episodes", - default=0, - help="Number of complete episodes to roll out. Rollout will also stop " - "if `--steps` (timesteps) limit is reached first. A value of 0 means " - "no limitation on the number of episodes run.") - parser.add_argument("--out", default=None, help="Output filename.") - parser.add_argument( - "--config", - default="{}", - type=json.loads, - help="Algorithm-specific configuration (e.g. env, hyperparams). " - "Gets merged with loaded configuration from checkpoint file and " - "`evaluation_config` settings therein.") - parser.add_argument( - "--save-info", - default=False, - action="store_true", - help="Save the info field generated by the step() method, " - "as well as the action, observations, rewards and done fields.") - parser.add_argument( - "--use-shelve", - default=False, - action="store_true", - help="Save rollouts into a python shelf file (will save each episode " - "as it is generated). 
An output filename must be set using --out.") - parser.add_argument( - "--track-progress", - default=False, - action="store_true", - help="Write progress to a temporary file (updated " - "after each episode). An output filename must be set using --out; " - "the progress file will live in the same folder.") - return parser - - class RolloutSaver: """Utility class for storing rollouts. @@ -256,31 +165,108 @@ def append_step(self, obs, action, next_obs, reward, done, info): self._total_steps += 1 +def create_parser(parser_creator=None): + parser_creator = parser_creator or argparse.ArgumentParser + parser = parser_creator( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Roll out a reinforcement learning agent " + "given a checkpoint.", + epilog=EXAMPLE_USAGE) + + parser.add_argument( + "checkpoint", type=str, help="Checkpoint from which to roll out.") + required_named = parser.add_argument_group("required named arguments") + required_named.add_argument( + "--run", + type=str, + required=True, + help="The algorithm or model to train. This may refer to the name " + "of a built-on algorithm (e.g. RLLib's DQN or PPO), or a " + "user-defined trainable function or class registered in the " + "tune registry.") + required_named.add_argument( + "--env", type=str, help="The gym environment to use.") + parser.add_argument( + "--no-render", + default=False, + action="store_const", + const=True, + help="Suppress rendering of the environment.") + parser.add_argument( + "--monitor", + default=False, + action="store_true", + help="Wrap environment in gym Monitor to record video. 
NOTE: This " + "option is deprecated: Use `--video-dir [some dir]` instead.") + parser.add_argument( + "--video-dir", + type=str, + default=None, + help="Specifies the directory into which videos of all episode " + "rollouts will be stored.") + parser.add_argument( + "--steps", + default=10000, + help="Number of timesteps to roll out (overwritten by --episodes).") + parser.add_argument( + "--episodes", + default=0, + help="Number of complete episodes to roll out (overrides --steps).") + parser.add_argument("--out", default=None, help="Output filename.") + parser.add_argument( + "--config", + default="{}", + type=json.loads, + help="Algorithm-specific configuration (e.g. env, hyperparams). " + "Gets merged with loaded configuration from checkpoint file and " + "`evaluation_config` settings therein.") + parser.add_argument( + "--save-info", + default=False, + action="store_true", + help="Save the info field generated by the step() method, " + "as well as the action, observations, rewards and done fields.") + parser.add_argument( + "--use-shelve", + default=False, + action="store_true", + help="Save rollouts into a python shelf file (will save each episode " + "as it is generated). An output filename must be set using --out.") + parser.add_argument( + "--track-progress", + default=False, + action="store_true", + help="Write progress to a temporary file (updated " + "after each episode). An output filename must be set using --out; " + "the progress file will live in the same folder.") + return parser + + def run(args, parser): # Load configuration from checkpoint file. - config_path = "" - if args.checkpoint: - config_dir = os.path.dirname(args.checkpoint) - config_path = os.path.join(config_dir, "params.pkl") - # Try parent directory. - if not os.path.exists(config_path): - config_path = os.path.join(config_dir, "../params.pkl") + config_dir = os.path.dirname(args.checkpoint) + config_path = os.path.join(config_dir, "params.pkl") + # Try parent directory. 
+ if not os.path.exists(config_path): + config_path = os.path.join(config_dir, "../params.pkl") - # Load the config from pickled. - if os.path.exists(config_path): - with open(config_path, "rb") as f: - config = cloudpickle.load(f) # If no pkl file found, require command line `--config`. - else: - # If no config in given checkpoint -> Error. - if args.checkpoint: + if not os.path.exists(config_path): + if not args.config: raise ValueError( "Could not find params.pkl in either the checkpoint dir or " - "its parent directory AND no `--config` given on command " - "line!") + "its parent directory AND no config given on command line!") + else: + config = args.config - # Use default config for given agent. - _, config = get_trainer_class(args.run, return_config=True) + # Load the config from pickled. + else: + with open(config_path, "rb") as f: + config = cloudpickle.load(f) + + # Set num_workers to be at least 2. + if "num_workers" in config: + config["num_workers"] = min(2, config["num_workers"]) # Make sure worker 0 has an Env. config["create_env_on_driver"] = True @@ -299,31 +285,25 @@ def run(args, parser): parser.error("the following arguments are required: --env") args.env = config.get("env") - # Make sure we have evaluation workers. - if not config.get("evaluation_num_workers"): - config["evaluation_num_workers"] = config.get("num_workers", 0) - if not config.get("evaluation_num_episodes"): - config["evaluation_num_episodes"] = 1 - config["render_env"] = not args.no_render - config["record_env"] = args.video_dir - - ray.init(local_mode=args.local_mode) + ray.init() # Create the Trainer from config. cls = get_trainable_cls(args.run) agent = cls(env=args.env, config=config) - - # Load state from checkpoint, if provided. - if args.checkpoint: - agent.restore(args.checkpoint) - + # Load state from checkpoint. + agent.restore(args.checkpoint) num_steps = int(args.steps) num_episodes = int(args.episodes) # Determine the video output directory. 
+ # Deprecated way: Use (--out|~/ray_results) + "/monitor" as dir. video_dir = None - # Allow user to specify a video output path. - if args.video_dir: + if args.monitor: + video_dir = os.path.join( + os.path.dirname(args.out or "") + or os.path.expanduser("~/ray_results/"), "monitor") + # New way: Allow user to specify a video output path. + elif args.video_dir: video_dir = os.path.expanduser(args.video_dir) # Do the actual rollout. @@ -353,13 +333,13 @@ def default_policy_agent_mapping(unused_agent_id): def keep_going(steps, num_steps, episodes, num_episodes): """Determine whether we've collected enough data""" - # If num_episodes is set, stop if limit reached. - if num_episodes and episodes >= num_episodes: - return False - # If num_steps is set, stop if limit reached. - elif num_steps and steps >= num_steps: - return False - # Otherwise, keep going. + # if num_episodes is set, this overrides num_steps + if num_episodes: + return episodes < num_episodes + # if num_steps is set, continue until we reach the limit + if num_steps: + return steps < num_steps + # otherwise keep going forever return True @@ -375,36 +355,16 @@ def rollout(agent, if saver is None: saver = RolloutSaver() - # Normal case: Agent was setup correctly with an evaluation WorkerSet, - # which we will now use to rollout. - if hasattr(agent, "evaluation_workers") and isinstance( - agent.evaluation_workers, WorkerSet): - steps = 0 - episodes = 0 - while keep_going(steps, num_steps, episodes, num_episodes): - saver.begin_rollout() - eval_result = agent._evaluate()["evaluation"] - # Increase timestep and episode counters. - eps = agent.config["evaluation_num_episodes"] - episodes += eps - steps += eps * eval_result["episode_len_mean"] - # Print out results and continue. - print("Episode #{}: reward: {}".format( - episodes, eval_result["episode_reward_mean"])) - saver.end_rollout() - return - - # Agent has no evaluation workers, but RolloutWorkers. 
- elif hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet): + if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] + policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} - # Agent has neither evaluation- nor rollout workers. else: from gym import envs if envs.registry.env_specs.get(agent.config["env"]): @@ -437,7 +397,7 @@ def rollout(agent, env = gym_wrappers.Monitor( env=env, directory=video_dir, - video_callable=lambda _: True, + video_callable=lambda x: True, force=True) steps = 0 @@ -510,6 +470,15 @@ def rollout(agent, parser = create_parser() args = parser.parse_args() + # Old option: monitor, use video-dir instead. + if args.monitor: + deprecation_warning("--monitor", "--video-dir=[some dir]") + # User tries to record videos, but no-render is set: Error. + if (args.monitor or args.video_dir) and args.no_render: + raise ValueError( + "You have --no-render set, but are trying to record rollout videos" + " (via options --video-dir/--monitor)! " + "Either unset --no-render or do not use --video-dir/--monitor.") # --use_shelve w/o --out option. 
if args.use_shelve and not args.out: raise ValueError( diff --git a/rllib/tests/test_checkpoint_restore.py b/rllib/tests/test_checkpoint_restore.py index b95a50015273..42bc039d8423 100644 --- a/rllib/tests/test_checkpoint_restore.py +++ b/rllib/tests/test_checkpoint_restore.py @@ -4,7 +4,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.test_utils import check, framework_iterator @@ -69,7 +69,7 @@ def ckpt_restore_test(alg_name, tfe=False): for fw in framework_iterator(config, frameworks=frameworks): for use_object_store in [False, True]: print("use_object_store={}".format(use_object_store)) - cls = get_trainer_class(alg_name) + cls = get_agent_class(alg_name) if "DDPG" in alg_name or "SAC" in alg_name: alg1 = cls(config=config, env="Pendulum-v0") alg2 = cls(config=config, env="Pendulum-v0") diff --git a/rllib/tests/test_eager_support.py b/rllib/tests/test_eager_support.py index b08918e04c28..95e6c69fc9e6 100644 --- a/rllib/tests/test_eager_support.py +++ b/rllib/tests/test_eager_support.py @@ -2,7 +2,7 @@ import ray from ray import tune -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.framework import try_import_tf tf1, tf, tfv = try_import_tf() @@ -23,7 +23,7 @@ def check_support(alg, config, test_eager=False, test_trace=True): else: config["env"] = "CartPole-v0" - a = get_trainer_class(alg) + a = get_agent_class(alg) if test_eager: print("tf-eager: alg={} cont.act={}".format(alg, cont)) config["eager_tracing"] = False diff --git a/rllib/tests/test_export.py b/rllib/tests/test_export.py index 711cc85b5956..f2f61b00545f 100644 --- a/rllib/tests/test_export.py +++ b/rllib/tests/test_export.py @@ -5,7 +5,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.tune.trial 
import ExportFormat CONFIGS = { @@ -74,7 +74,7 @@ def valid_tf_checkpoint(checkpoint_dir): and os.path.exists(os.path.join(checkpoint_dir, "model.index")) \ and os.path.exists(os.path.join(checkpoint_dir, "checkpoint")) - cls = get_trainer_class(alg_name) + cls = get_agent_class(alg_name) if "DDPG" in alg_name or "SAC" in alg_name: algo = cls(config=CONFIGS[alg_name], env="Pendulum-v0") else: diff --git a/rllib/tests/test_ignore_worker_failure.py b/rllib/tests/test_ignore_worker_failure.py index a49d068f4ec0..8cb9962ce8a0 100644 --- a/rllib/tests/test_ignore_worker_failure.py +++ b/rllib/tests/test_ignore_worker_failure.py @@ -3,7 +3,7 @@ import ray from ray.rllib import _register_all -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.test_utils import framework_iterator from ray.tune.registry import register_env @@ -37,7 +37,7 @@ def do_test(self, alg, config, fn=None): def _do_test_fault_recover(self, alg, config): register_env("fault_env", lambda c: FaultInjectEnv(c)) - agent_cls = get_trainer_class(alg) + agent_cls = get_agent_class(alg) # Test fault handling config["num_workers"] = 2 @@ -51,7 +51,7 @@ def _do_test_fault_recover(self, alg, config): def _do_test_fault_fatal(self, alg, config): register_env("fault_env", lambda c: FaultInjectEnv(c)) - agent_cls = get_trainer_class(alg) + agent_cls = get_agent_class(alg) # Test raises real error when out of workers config["num_workers"] = 2 config["ignore_worker_failures"] = True diff --git a/rllib/tests/test_model_imports.py b/rllib/tests/test_model_imports.py index d4d1c8545311..2a03b3789ff3 100644 --- a/rllib/tests/test_model_imports.py +++ b/rllib/tests/test_model_imports.py @@ -6,7 +6,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.tf.misc import normc_initializer 
from ray.rllib.models.tf.tf_modelv2 import TFModelV2 @@ -127,7 +127,7 @@ def model_import_test(algo, config, env): rllib_dir = Path(__file__).parent.parent import_file = str(rllib_dir) + "/tests/data/model_weights/weights.h5" - agent_cls = get_trainer_class(algo) + agent_cls = get_agent_class(algo) for fw in framework_iterator(config, ["tf", "torch"]): config["model"]["custom_model"] = "keras_model" if fw != "torch" else \ diff --git a/rllib/tests/test_pettingzoo_env.py b/rllib/tests/test_pettingzoo_env.py index d56d82c53d07..bf3fc4aaa4cd 100644 --- a/rllib/tests/test_pettingzoo_env.py +++ b/rllib/tests/test_pettingzoo_env.py @@ -4,7 +4,7 @@ import ray from ray.tune.registry import register_env from ray.rllib.env import PettingZooEnv -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from pettingzoo.mpe import simple_spread_v2 @@ -20,7 +20,7 @@ def test_pettingzoo_env(self): register_env("simple_spread", lambda _: PettingZooEnv(simple_spread_v2.env())) - agent_class = get_trainer_class("PPO") + agent_class = get_agent_class("PPO") config = deepcopy(agent_class._default_config) diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index 933c2d60814e..7e7eecc41b60 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -1,7 +1,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.examples.env.multi_agent import MultiAgentCartPole, \ MultiAgentMountainCar from ray.rllib.utils.test_utils import framework_iterator @@ -19,11 +19,10 @@ def check_support_multiagent(alg, config): alg in ["A3C", "APEX", "APEX_DDPG", "IMPALA"]: continue if alg in ["DDPG", "APEX_DDPG", "SAC"]: - a = get_trainer_class(alg)( + a = get_agent_class(alg)( config=config, env="multi_agent_mountaincar") else: - a = get_trainer_class(alg)( - 
config=config, env="multi_agent_cartpole") + a = get_agent_class(alg)(config=config, env="multi_agent_cartpole") print(a.train()) a.stop() diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 05b90cba52d2..40bba43b2cb8 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -3,7 +3,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.examples.env.random_env import RandomEnv from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2 from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2 @@ -65,7 +65,7 @@ def _do_check(alg, config, a_name, o_name): stat = "ok" try: - a = get_trainer_class(alg)(config=config, env=RandomEnv) + a = get_agent_class(alg)(config=config, env=RandomEnv) except UnsupportedSpaceException: stat = "unsupported" else: diff --git a/rllib/train.py b/rllib/train.py index 8314556d045a..228dcbfbca36 100755 --- a/rllib/train.py +++ b/rllib/train.py @@ -60,7 +60,8 @@ def create_parser(parser_creator=None): parser.add_argument( "--local-mode", action="store_true", - help="Run ray in local mode for easier debugging.") + help="Whether to run ray with `local_mode=True`. " + "Only if --ray-num-nodes is not used.") parser.add_argument( "--ray-num-cpus", default=None, From c346c05e42db44b488d759696d5c71eb48c1fabd Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 066/244] Revert "[Java] fix test hang occasionally when running FailureTest (#13934)" This reverts commit 3ca9c5d889bf965f61437954f08745d345d35d27. 
--- .../io/ray/runtime/runner/RunManager.java | 2 +- java/test.sh | 57 +++--- .../io/ray/test/TestProgressListener.java | 166 ++---------------- java/testng.xml | 2 +- src/ray/core_worker/core_worker.cc | 16 +- src/ray/core_worker/core_worker.h | 2 - 6 files changed, 44 insertions(+), 201 deletions(-) diff --git a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java index 192e5550ceb4..2307b0489d3c 100644 --- a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java +++ b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java @@ -96,7 +96,7 @@ public static void getAddressInfoAndFillConfig(RayConfig rayConfig) { * * @param command The command to start the process with. */ - public static String runCommand(List command) throws IOException, InterruptedException { + private static String runCommand(List command) throws IOException, InterruptedException { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Starting process with command: {}", Joiner.on(" ").join(command)); } diff --git a/java/test.sh b/java/test.sh index b49f06037c10..a842194e67fb 100755 --- a/java/test.sh +++ b/java/test.sh @@ -16,27 +16,30 @@ pushd "$ROOT_DIR" mvn -T16 checkstyle:check popd +on_exit() { + exit_code=$? + if [ $exit_code -ne 0 ]; then + echo "Exit trap, printing ray logs" + cat /tmp/ray/session_latest/logs/* + fi +} + +trap on_exit EXIT + run_testng() { - local pid local exit_code - "$@" & - pid=$! - if wait $pid; then + if "$@"; then exit_code=0 else exit_code=$? fi # exit_code == 2 means there are skipped tests. if [ $exit_code -ne 2 ] && [ $exit_code -ne 0 ] ; then - # Only print log files if it ran in cluster mode - if [[ ! "$*" =~ SINGLE_PROCESS ]]; then - if [ $exit_code -gt 128 ] ; then - # Test crashed. Print the driver log for diagnosis. - cat /tmp/ray/session_latest/logs/java-core-driver-*$pid* - fi + if [ $exit_code -gt 128 ] ; then + # Test crashed. Print the driver log for diagnosis. 
+ cat /tmp/ray/session_latest/logs/java-core-driver-* fi - # Only print the hs_err_pid file of TestNG process - find . -name "hs_err_pid$pid.log" -exec cat {} + + find . -name "hs_err_*log" -exec cat {} + exit $exit_code fi } @@ -57,31 +60,11 @@ if ! git diff --exit-code -- java src/ray/core_worker/lib/java; then exit 1 fi -# NOTE(kfstrom): Java test troubleshooting only. -# Set MAX_ROUNDS to a big number (e.g. 1000) to run Java tests repeatedly. -# You may also want to modify java/testng.xml to run only a subset of test cases. -MAX_ROUNDS=1 -if [ $MAX_ROUNDS -gt 1 ]; then - export RAY_BACKEND_LOG_LEVEL=debug -fi - -round=1 -while true; do - echo Starting cluster mode test round $round - - echo "Running tests under cluster mode." - # TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, - # TestNG will exit with code 2. And bazel treats it as test failure. - # bazel test //java:all_tests --config=ci || cluster_exit_code=$? - run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml - - echo Finished cluster mode test round $round - date - round=$((round+1)) - if (( round > MAX_ROUNDS )); then - break - fi -done +echo "Running tests under cluster mode." +# TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, +# TestNG will exit with code 2. And bazel treats it as test failure. +# bazel test //java:all_tests --config=ci || cluster_exit_code=$? +run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running tests under single-process mode." # bazel test //java:all_tests --jvmopt="-Dray.run-mode=SINGLE_PROCESS" --config=ci || single_exit_code=$? 
diff --git a/java/test/src/main/java/io/ray/test/TestProgressListener.java b/java/test/src/main/java/io/ray/test/TestProgressListener.java index 915d82af317b..1fed5ac21375 100644 --- a/java/test/src/main/java/io/ray/test/TestProgressListener.java +++ b/java/test/src/main/java/io/ray/test/TestProgressListener.java @@ -1,42 +1,27 @@ package io.ray.test; -import com.google.common.collect.ImmutableList; -import io.ray.runtime.runner.RunManager; -import java.io.File; import java.time.LocalDateTime; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.SystemUtils; import org.testng.IInvokedMethod; import org.testng.IInvokedMethodListener; import org.testng.ITestContext; import org.testng.ITestListener; import org.testng.ITestResult; -import org.testng.SkipException; public class TestProgressListener implements IInvokedMethodListener, ITestListener { - // Travis aborts CI if no outputs for 10 minutes. So threshold needs to be smaller than 10m. - private static final long hangDetectionThresholdMillis = 5 * 60 * 1000; - private static final int TAIL_NO_OF_LINES = 500; - private Thread testMainThread; - private long testStartTimeMillis; - private String getFullTestName(ITestResult testResult) { return testResult.getTestClass().getName() + "." 
+ testResult.getMethod().getMethodName(); } - private void printSection(String sectionName) { + private void printInfo(String tag, String content) { System.out.println( - "============ [" + LocalDateTime.now().toString() + "] " + sectionName + " ============"); - } - - private void printTestStage(String tag, String content) { - printSection("[" + tag + "] " + content); + "============ [" + + LocalDateTime.now().toString() + + "] [" + + tag + + "] " + + content + + " ============"); } @Override @@ -47,50 +32,31 @@ public void afterInvocation(IInvokedMethod method, ITestResult testResult) {} @Override public void onTestStart(ITestResult result) { - printTestStage("TEST START", getFullTestName(result)); - testStartTimeMillis = System.currentTimeMillis(); - // TODO(kfstorm): Add a timer to detect hang - if (testMainThread == null) { - testMainThread = Thread.currentThread(); - Thread hangDetectionThread = - new Thread( - () -> { - try { - // If current task case has ran for more than 5 minutes. 
- while (System.currentTimeMillis() - testStartTimeMillis - < hangDetectionThresholdMillis) { - Thread.sleep(1000); - } - printDebugInfo(null, /*testHanged=*/ true); - } catch (InterruptedException e) { - // ignored - } - }); - hangDetectionThread.setDaemon(true); - hangDetectionThread.start(); - } + printInfo("TEST START", getFullTestName(result)); } @Override public void onTestSuccess(ITestResult result) { - printTestStage("TEST SUCCESS", getFullTestName(result)); + printInfo("TEST SUCCESS", getFullTestName(result)); } @Override public void onTestFailure(ITestResult result) { - printTestStage("TEST FAILURE", getFullTestName(result)); - printDebugInfo(result, /*testHanged=*/ false); + printInfo("TEST FAILURE", getFullTestName(result)); + Throwable throwable = result.getThrowable(); + if (throwable != null) { + throwable.printStackTrace(); + } } @Override public void onTestSkipped(ITestResult result) { - printTestStage("TEST SKIPPED", getFullTestName(result)); - printDebugInfo(result, /*testHanged=*/ false); + printInfo("TEST SKIPPED", getFullTestName(result)); } @Override public void onTestFailedButWithinSuccessPercentage(ITestResult result) { - printTestStage("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); + printInfo("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); } @Override @@ -98,102 +64,4 @@ public void onStart(ITestContext context) {} @Override public void onFinish(ITestContext context) {} - - private void printDebugInfo(ITestResult result, boolean testHanged) { - boolean testFailed = false; - if (result != null) { - Throwable throwable = result.getThrowable(); - if (throwable != null && !(throwable instanceof SkipException)) { - testFailed = true; - throwable.printStackTrace(); - } - } - if (!testFailed && !testHanged) { - return; - } - - if (testHanged) { - printSection("TEST CASE HANGED"); - printSection("STACK TRACE OF TEST THREAD"); - for (StackTraceElement element : testMainThread.getStackTrace()) { - 
System.out.println(element.toString()); - } - Set javaPids = getJavaPids(); - for (Integer pid : javaPids) { - runCommandSafely(ImmutableList.of("jstack", pid.toString())); - // TODO(kfstorm): Check lldb or gdb exists rather than detecting OS type. - if (SystemUtils.IS_OS_MAC) { - runCommandSafely( - ImmutableList.of("lldb", "--batch", "-o", "bt all", "-p", pid.toString())); - } else { - runCommandSafely( - ImmutableList.of( - "sudo", "gdb", "-batch", "-ex", "thread apply all bt", "-p", pid.toString())); - } - } - } - - printLogFiles(); - - if (testHanged) { - printSection("ABORT TEST"); - System.exit(1); - } - } - - private String runCommandSafely(List command) { - String output; - String commandString = String.join(" ", command); - printSection(commandString); - try { - output = RunManager.runCommand(command); - System.out.println(output); - } catch (Exception e) { - System.out.println("Failed to execute command: " + commandString); - e.printStackTrace(); - output = ""; - } - return output; - } - - private Set getJavaPids() { - Set javaPids = new HashSet<>(); - String jpsOutput = runCommandSafely(ImmutableList.of("jps", "-v")); - try { - for (String line : StringUtils.split(jpsOutput, "\n")) { - String[] parts = StringUtils.split(line); - if (parts.length > 1 && parts[1].toLowerCase().equals("jps")) { - // Skip jps. 
- continue; - } - Integer pid = Integer.valueOf(parts[0]); - javaPids.add(pid); - } - } catch (Exception e) { - System.out.println("Failed to parse jps output."); - e.printStackTrace(); - } - - String pgrepJavaResult = runCommandSafely(ImmutableList.of("pgrep", "java")); - try { - for (String line : StringUtils.split(pgrepJavaResult, "\n")) { - Integer pid = Integer.valueOf(line); - javaPids.add(pid); - } - } catch (Exception e) { - System.out.println("Failed to parse pgrep java output."); - e.printStackTrace(); - } - - return javaPids; - } - - private void printLogFiles() { - Collection logFiles = - FileUtils.listFiles(new File("/tmp/ray/session_latest/logs"), null, false); - for (File file : logFiles) { - runCommandSafely( - ImmutableList.of("tail", "-n", String.valueOf(TAIL_NO_OF_LINES), file.getAbsolutePath())); - } - } } diff --git a/java/testng.xml b/java/testng.xml index 0db2704845d4..6cc10b9ab24a 100644 --- a/java/testng.xml +++ b/java/testng.xml @@ -1,6 +1,6 @@ - + diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 262c837011a7..6c8287c1507b 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -161,19 +161,15 @@ CoreWorkerProcess::CoreWorkerProcess(const CoreWorkerOptions &options) // RayConfig is generated in Java_io_ray_runtime_RayNativeRuntime_nativeInitialize // for java worker or in constructor of CoreWorker for python worker. ray::stats::Init(global_tags, options_.metrics_agent_port); - - // NOTE(kfstorm): std::atexit should be put at the end of `CoreWorkerProcess` - // constructor. We assume that spdlog has been initialized before this line. When the - // process is exiting, `HandleAtExit` will be invoked before destructing spdlog static - // variables. We explicitly destruct `CoreWorkerProcess` instance in the callback to - // ensure the static `CoreWorkerProcess` instance is destructed while spdlog is still - // usable. 
This prevents crashing (or hanging) when using `RAY_LOG` in - // `CoreWorkerProcess` destructor. - RAY_CHECK(std::atexit(CoreWorkerProcess::HandleAtExit) == 0); } CoreWorkerProcess::~CoreWorkerProcess() { RAY_LOG(INFO) << "Destructing CoreWorkerProcess. pid: " << getpid(); + { + // Check that all `CoreWorker` instances have been removed. + absl::ReaderMutexLock lock(&worker_map_mutex_); + RAY_CHECK(workers_.empty()); + } RAY_LOG(DEBUG) << "Stats stop in core worker."; // Shutdown stats module if worker process exits. ray::stats::Shutdown(); @@ -187,8 +183,6 @@ void CoreWorkerProcess::EnsureInitialized() { << "shutdown."; } -void CoreWorkerProcess::HandleAtExit() { instance_.reset(); } - std::shared_ptr CoreWorkerProcess::TryGetWorker(const WorkerID &worker_id) { if (!instance_) { return nullptr; diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 72ef4f36ca7b..6fa24c29e94e 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -265,8 +265,6 @@ class CoreWorkerProcess { /// \return Void. static void EnsureInitialized(); - static void HandleAtExit(); - /// Get the `CoreWorker` instance by worker ID. /// /// \param[in] workerId The worker ID. From 2f221e31eb7d9017bf9877482490b3e6dc20755d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 067/244] Revert "[core] Java worker should respect the user provided node_ip_address (#13732)" This reverts commit 5c97d3f4d536fa83f7be1d3fa679d286334f26c2. 
--- .../src/main/java/io/ray/test/NodeIpTest.java | 46 ------------------- python/ray/_private/services.py | 8 +--- 2 files changed, 1 insertion(+), 53 deletions(-) delete mode 100644 java/test/src/main/java/io/ray/test/NodeIpTest.java diff --git a/java/test/src/main/java/io/ray/test/NodeIpTest.java b/java/test/src/main/java/io/ray/test/NodeIpTest.java deleted file mode 100644 index 4aee086efcb7..000000000000 --- a/java/test/src/main/java/io/ray/test/NodeIpTest.java +++ /dev/null @@ -1,46 +0,0 @@ -package io.ray.test; - -import io.ray.api.Ray; -import org.apache.commons.lang3.SystemUtils; -import org.testng.Assert; -import org.testng.SkipException; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -@Test(groups = {"cluster"}) -public class NodeIpTest extends BaseTest { - - private static final String NODE_IP = "127.0.0.2"; - - @BeforeClass - public void setUp() { - if (SystemUtils.IS_OS_MAC) { - throw new SkipException("Skip NodeIpTest on Mac OS"); - } - System.setProperty("ray.head-args.0", "--node-ip-address=127.0.0.2"); - System.setProperty("ray.node-ip", "127.0.0.2"); - } - - @AfterClass - public void tearDown() { - if (!SystemUtils.IS_OS_MAC) { - System.clearProperty("ray.head-args.0"); - System.clearProperty("ray.node-ip"); - } - } - - static String getNodeIp() { - return TestUtils.getRuntime().getRayConfig().nodeIp; - } - - public void testNodeIp() { - // this is on the driver node, and it should be equal with ray.node-ip - String nodeIP = TestUtils.getRuntime().getRayConfig().nodeIp; - Assert.assertEquals(nodeIP, NODE_IP); - - // this is on the worker node, and it should be equal with node-ip-address - nodeIP = Ray.task(NodeIpTest::getNodeIp).remote().get(); - Assert.assertEquals(nodeIP, NODE_IP); - } -} diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index c3144c05f39c..4ae4fed1758e 100644 --- a/python/ray/_private/services.py +++ 
b/python/ray/_private/services.py @@ -1370,7 +1370,6 @@ def start_raylet(redis_address, raylet_name, redis_password, session_dir, - node_ip_address, ) else: java_worker_command = [] @@ -1509,8 +1508,7 @@ def get_ray_jars_dir(): def build_java_worker_command(java_worker_options, redis_address, node_manager_port, plasma_store_name, - raylet_name, redis_password, session_dir, - node_ip_address): + raylet_name, redis_password, session_dir): """This method assembles the command used to start a Java worker. Args: @@ -1521,7 +1519,6 @@ def build_java_worker_command(java_worker_options, redis_address, raylet_name (str): The name of the raylet socket to create. redis_password (str): The password of connect to redis. session_dir (str): The path of this session. - node_ip_address (str): The ip address for this node. Returns: The command string for starting Java worker. """ @@ -1539,9 +1536,6 @@ def build_java_worker_command(java_worker_options, redis_address, if redis_password is not None: pairs.append(("ray.redis.password", redis_password)) - if node_ip_address is not None: - pairs.append(("ray.node-ip", node_ip_address)) - pairs.append(("ray.home", RAY_HOME)) pairs.append(("ray.logging.dir", os.path.join(session_dir, "logs"))) pairs.append(("ray.session-dir", session_dir)) From 3cbaffe4a84606ac73415ad10d77f1346ce995a7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 068/244] Revert "[core/client] enable more tests (#13961)" This reverts commit 0590017e1ff8f36364d479e4c1d22b90e5eebf76. 
--- python/ray/node.py | 19 ---------- python/ray/tests/test_client_metadata.py | 5 --- python/ray/tests/test_client_references.py | 40 +++----------------- python/ray/tests/test_client_terminate.py | 6 --- python/ray/tests/test_cross_language.py | 4 -- python/ray/tests/test_mini.py | 6 --- python/ray/tests/test_monitor.py | 6 --- python/ray/util/client/ray_client_helpers.py | 19 ---------- 8 files changed, 6 insertions(+), 99 deletions(-) diff --git a/python/ray/node.py b/python/ray/node.py index cd2dc2250677..a63a0a8a8996 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -11,7 +11,6 @@ import subprocess import sys import tempfile -import threading import time from typing import Optional, Dict @@ -92,7 +91,6 @@ def __init__(self, self.kernel_fate_share = bool( spawn_reaper and ray.utils.detect_fate_sharing_support()) self.all_processes = {} - self.removal_lock = threading.Lock() # Try to get node IP address with the parameters. if ray_params.node_ip_address: @@ -925,23 +923,6 @@ def _kill_process_type(self, 2. The process had been started in valgrind and had a non-zero exit code. 
""" - - # Ensure thread safety - with self.removal_lock: - self._kill_process_impl( - process_type, - allow_graceful=allow_graceful, - check_alive=check_alive, - wait=wait) - - def _kill_process_impl(self, - process_type, - allow_graceful=False, - check_alive=True, - wait=False): - """See `_kill_process_type`.""" - if process_type not in self.all_processes: - return process_infos = self.all_processes[process_type] if process_type != ray_constants.PROCESS_TYPE_REDIS_SERVER: assert len(process_infos) == 1 diff --git a/python/ray/tests/test_client_metadata.py b/python/ray/tests/test_client_metadata.py index 1a6c4e2a5633..ffec75a77c17 100644 --- a/python/ray/tests/test_client_metadata.py +++ b/python/ray/tests/test_client_metadata.py @@ -38,8 +38,3 @@ def test_get_runtime_context(ray_start_regular_shared): with pytest.raises(Exception): _ = rtc.task_id - - -if __name__ == "__main__": - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_client_references.py b/python/ray/tests/test_client_references.py index 54bfa7f4290c..8a4458e14af8 100644 --- a/python/ray/tests/test_client_references.py +++ b/python/ray/tests/test_client_references.py @@ -1,7 +1,5 @@ -import pytest from ray.util.client.ray_client_helpers import ray_start_client_server -from ray.util.client.ray_client_helpers import ( - ray_start_client_server_pair, ray_start_cluster_client_server_pair) +from ray.util.client.ray_client_helpers import ray_start_client_server_pair from ray.test_utils import wait_for_condition import ray as real_ray from ray.core.generated.gcs_pb2 import ActorTableData @@ -32,14 +30,8 @@ def test_cond(): return test_cond -@pytest.mark.parametrize( - "ray_start_cluster", [{ - "num_nodes": 1, - "do_init": False - }], indirect=True) -def test_delete_refs_on_disconnect(ray_start_cluster): - cluster = ray_start_cluster - with ray_start_cluster_client_server_pair(cluster.address) as pair: +def test_delete_refs_on_disconnect(ray_start_regular): + with 
ray_start_client_server_pair() as pair: ray, server = pair @ray.remote @@ -57,15 +49,11 @@ def f(x): # And can get the data assert ray.get(thing1) == 8 - # Close the client. + # Close the client ray.close() wait_for_condition(server_object_ref_count(server, 0), timeout=5) - # Connect to the real ray again, since we disconnected - # upon num_clients = 0. - real_ray.init(address=cluster.address) - def test_cond(): return len(real_ray.objects()) == 0 @@ -85,14 +73,8 @@ def test_delete_ref_on_object_deletion(ray_start_regular): wait_for_condition(server_object_ref_count(server, 1), timeout=5) -@pytest.mark.parametrize( - "ray_start_cluster", [{ - "num_nodes": 1, - "do_init": False - }], indirect=True) -def test_delete_actor_on_disconnect(ray_start_cluster): - cluster = ray_start_cluster - with ray_start_cluster_client_server_pair(cluster.address) as pair: +def test_delete_actor_on_disconnect(ray_start_regular): + with ray_start_client_server_pair() as pair: ray, server = pair @ray.remote @@ -124,10 +106,6 @@ def test_cond(): ] return len(alive_actors) == 0 - # Connect to the real ray again, since we disconnected - # upon num_clients = 0. 
- real_ray.init(address=cluster.address) - wait_for_condition(test_cond, timeout=10) @@ -174,9 +152,3 @@ def get(self): del ref1 assert ray.get(ref2) == "hi" del ref2 - - -if __name__ == "__main__": - import sys - import pytest - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_client_terminate.py b/python/ray/tests/test_client_terminate.py index 6f7af830f349..9016c627a6a5 100644 --- a/python/ray/tests/test_client_terminate.py +++ b/python/ray/tests/test_client_terminate.py @@ -83,9 +83,3 @@ def wait_for(t): signaler2.send.remote() ray.get(obj1) - - -if __name__ == "__main__": - import sys - import pytest - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cross_language.py b/python/ray/tests/test_cross_language.py index 4ffd6db3e4f1..10766b18bd44 100644 --- a/python/ray/tests/test_cross_language.py +++ b/python/ray/tests/test_cross_language.py @@ -24,7 +24,3 @@ class PythonObject(object): with pytest.raises(Exception, match="transfer"): ray.java_function("a", "b").remote(PythonObject()) - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_mini.py b/python/ray/tests/test_mini.py index 724deb542aae..dae1e11bd38f 100644 --- a/python/ray/tests/test_mini.py +++ b/python/ray/tests/test_mini.py @@ -59,9 +59,3 @@ def get(self): x = 1 f = Foo.remote(x) assert (ray.get(f.get.remote()) == x) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_monitor.py b/python/ray/tests/test_monitor.py index e4b14166d747..ac67ddcf2cdc 100644 --- a/python/ray/tests/test_monitor.py +++ b/python/ray/tests/test_monitor.py @@ -37,9 +37,3 @@ def test_parse_resource_demands(): # counted as infeasible or waiting, as long as it's accounted for and # doesn't cause an error. 
assert len(waiting + infeasible) == 10 - - -if __name__ == "__main__": - import sys - import pytest - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/client/ray_client_helpers.py b/python/ray/util/client/ray_client_helpers.py index a7f16c246aa7..77f09346d7af 100644 --- a/python/ray/util/client/ray_client_helpers.py +++ b/python/ray/util/client/ray_client_helpers.py @@ -1,6 +1,5 @@ from contextlib import contextmanager -import ray as real_ray import ray.util.client.server.server as ray_client_server from ray.util.client import ray @@ -23,21 +22,3 @@ def ray_start_client_server_pair(): ray._inside_client_test = False ray.disconnect() server.stop(0) - - -@contextmanager -def ray_start_cluster_client_server_pair(address): - ray._inside_client_test = True - - def ray_connect_handler(): - real_ray.init(address=address) - - server = ray_client_server.serve( - "localhost:50051", ray_connect_handler=ray_connect_handler) - ray.connect("localhost:50051") - try: - yield ray, server - finally: - ray._inside_client_test = False - ray.disconnect() - server.stop(0) From cb00f8d1a4597ab3d678eac94369becc7bb12bd5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 069/244] Revert "[ray_client] close ray connection upon client deactivation (#13919)" This reverts commit 8787c7010690905e894319780a6a5bc3dcc30322. 
--- ci/travis/ci.sh | 1 - python/ray/tests/BUILD | 4 +- python/ray/tests/test_client_init.py | 260 ++++++++---------- python/ray/tests/test_job.py | 4 +- python/ray/util/client/server/dataservicer.py | 15 +- python/ray/util/client/server/server.py | 51 +--- python/ray/util/client/worker.py | 31 +-- 7 files changed, 152 insertions(+), 214 deletions(-) diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 61b74b082798..2d381ba24b15 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -152,7 +152,6 @@ test_python() { -python/ray/tests:test_basic_3 # timeout -python/ray/tests:test_basic_3_client_mode -python/ray/tests:test_cli - -python/ray/tests:test_client_init # timeout -python/ray/tests:test_failure -python/ray/tests:test_global_gc -python/ray/tests:test_job diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 2572c50c2dcf..4ef81d504f63 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -26,8 +26,6 @@ py_test_module_list( "test_basic_3.py", "test_cancel.py", "test_cli.py", - "test_client.py", - "test_client_init.py", "test_component_failures_2.py", "test_component_failures_3.py", "test_error_ray_not_initialized.py", @@ -82,7 +80,9 @@ py_test_module_list( "test_asyncio.py", "test_autoscaler.py", "test_autoscaler_yaml.py", + "test_client_init.py", "test_client_metadata.py", + "test_client.py", "test_client_references.py", "test_client_terminate.py", "test_command_runner.py", diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py index 8053ab5774e6..6b6ce8a42598 100644 --- a/python/ray/tests/test_client_init.py +++ b/python/ray/tests/test_client_init.py @@ -38,146 +38,130 @@ def get(self): return self.val -@pytest.fixture -def init_and_serve(): - server_handle, _ = ray_client_server.init_and_serve("localhost:50051") - yield server_handle - ray_client_server.shutdown_with_server(server_handle.grpc_server) - time.sleep(2) - - -@pytest.fixture -def init_and_serve_lazy(): - cluster = 
ray.cluster_utils.Cluster() - cluster.add_node(num_cpus=1, num_gpus=0) - address = cluster.address - - def connect(): - ray.init(address=address) - - server_handle = ray_client_server.serve("localhost:50051", connect) - yield server_handle - ray_client_server.shutdown_with_server(server_handle.grpc_server) - time.sleep(2) - - -def test_basic_preregister(init_and_serve): +def test_basic_preregister(): from ray.util.client import ray - ray.connect("localhost:50051") - val = ray.get(hello_world.remote()) - print(val) - assert val >= 20 - assert val <= 200 - c = C.remote(3) - x = c.double.remote() - y = c.double.remote() - ray.wait([x, y]) - val = ray.get(c.get.remote()) - assert val == 12 - ray.disconnect() - - -def test_num_clients(init_and_serve_lazy): + server, _ = ray_client_server.init_and_serve("localhost:50051") + try: + ray.connect("localhost:50051") + val = ray.get(hello_world.remote()) + print(val) + assert val >= 20 + assert val <= 200 + c = C.remote(3) + x = c.double.remote() + y = c.double.remote() + ray.wait([x, y]) + val = ray.get(c.get.remote()) + assert val == 12 + finally: + ray.disconnect() + ray_client_server.shutdown_with_server(server) + time.sleep(2) + + +def test_num_clients(): # Tests num clients reporting; useful if you want to build an app that # load balances clients between Ray client servers. + server_handle, _ = ray_client_server.init_and_serve("localhost:50051") + server = server_handle.grpc_server + try: + api1 = RayAPIStub() + info1 = api1.connect("localhost:50051") + assert info1["num_clients"] == 1, info1 + api2 = RayAPIStub() + info2 = api2.connect("localhost:50051") + assert info2["num_clients"] == 2, info2 + + # Disconnect the first two clients. + api1.disconnect() + api2.disconnect() + time.sleep(1) + + api3 = RayAPIStub() + info3 = api3.connect("localhost:50051") + assert info3["num_clients"] == 1, info3 + + # Check info contains ray and python version. 
+ assert isinstance(info3["ray_version"], str), info3 + assert isinstance(info3["ray_commit"], str), info3 + assert isinstance(info3["python_version"], str), info3 + assert isinstance(info3["protocol_version"], str), info3 + api3.disconnect() + finally: + ray_client_server.shutdown_with_server(server) + time.sleep(2) + + +def test_python_version(): - def get_job_id(api): - return api.get_runtime_context().worker.current_job_id - - api1 = RayAPIStub() - info1 = api1.connect("localhost:50051") - job_id_1 = get_job_id(api1) - assert info1["num_clients"] == 1, info1 - api2 = RayAPIStub() - info2 = api2.connect("localhost:50051") - job_id_2 = get_job_id(api2) - assert info2["num_clients"] == 2, info2 - - assert job_id_1 == job_id_2 - - # Disconnect the first two clients. - api1.disconnect() - api2.disconnect() - time.sleep(1) - - api3 = RayAPIStub() - info3 = api3.connect("localhost:50051") - job_id_3 = get_job_id(api3) - assert info3["num_clients"] == 1, info3 - assert job_id_1 != job_id_3 - - # Check info contains ray and python version. 
- assert isinstance(info3["ray_version"], str), info3 - assert isinstance(info3["ray_commit"], str), info3 - assert isinstance(info3["python_version"], str), info3 - assert isinstance(info3["protocol_version"], str), info3 - api3.disconnect() - - -def test_python_version(init_and_serve): - server_handle = init_and_serve - ray = RayAPIStub() - info1 = ray.connect("localhost:50051") - assert info1["python_version"] == ".".join( - [str(x) for x in list(sys.version_info)[:3]]) - ray.disconnect() - time.sleep(1) - - def mock_connection_response(): - return ray_client_pb2.ConnectionInfoResponse( - num_clients=1, - python_version="2.7.12", - ray_version="", - ray_commit="", - protocol_version=CURRENT_PROTOCOL_VERSION, - ) - - # inject mock connection function - server_handle.data_servicer._build_connection_response = \ - mock_connection_response - - ray = RayAPIStub() - with pytest.raises(RuntimeError): - _ = ray.connect("localhost:50051") - - ray = RayAPIStub() - info3 = ray.connect("localhost:50051", ignore_version=True) - assert info3["num_clients"] == 1, info3 - ray.disconnect() - - -def test_protocol_version(init_and_serve): - server_handle = init_and_serve - ray = RayAPIStub() - info1 = ray.connect("localhost:50051") - local_py_version = ".".join([str(x) for x in list(sys.version_info)[:3]]) - assert info1["protocol_version"] == CURRENT_PROTOCOL_VERSION, info1 - ray.disconnect() - time.sleep(1) - - def mock_connection_response(): - return ray_client_pb2.ConnectionInfoResponse( - num_clients=1, - python_version=local_py_version, - ray_version="", - ray_commit="", - protocol_version="2050-01-01", # from the future - ) - - # inject mock connection function - server_handle.data_servicer._build_connection_response = \ - mock_connection_response - - ray = RayAPIStub() - with pytest.raises(RuntimeError): - _ = ray.connect("localhost:50051") - - ray = RayAPIStub() - info3 = ray.connect("localhost:50051", ignore_version=True) - assert info3["num_clients"] == 1, info3 - 
ray.disconnect() - + server_handle, _ = ray_client_server.init_and_serve("localhost:50051") + try: + ray = RayAPIStub() + info1 = ray.connect("localhost:50051") + assert info1["python_version"] == ".".join( + [str(x) for x in list(sys.version_info)[:3]]) + ray.disconnect() + time.sleep(1) + + def mock_connection_response(): + return ray_client_pb2.ConnectionInfoResponse( + num_clients=1, + python_version="2.7.12", + ray_version="", + ray_commit="", + protocol_version=CURRENT_PROTOCOL_VERSION, + ) + + # inject mock connection function + server_handle.data_servicer._build_connection_response = \ + mock_connection_response + + ray = RayAPIStub() + with pytest.raises(RuntimeError): + _ = ray.connect("localhost:50051") + + ray = RayAPIStub() + info3 = ray.connect("localhost:50051", ignore_version=True) + assert info3["num_clients"] == 1, info3 + ray.disconnect() + finally: + ray_client_server.shutdown_with_server(server_handle.grpc_server) + time.sleep(2) + + +def test_protocol_version(): -if __name__ == "__main__": - import pytest - sys.exit(pytest.main(["-v", __file__] + sys.argv[1:])) + server_handle, _ = ray_client_server.init_and_serve("localhost:50051") + try: + ray = RayAPIStub() + info1 = ray.connect("localhost:50051") + local_py_version = ".".join( + [str(x) for x in list(sys.version_info)[:3]]) + assert info1["protocol_version"] == CURRENT_PROTOCOL_VERSION, info1 + ray.disconnect() + time.sleep(1) + + def mock_connection_response(): + return ray_client_pb2.ConnectionInfoResponse( + num_clients=1, + python_version=local_py_version, + ray_version="", + ray_commit="", + protocol_version="2050-01-01", # from the future + ) + + # inject mock connection function + server_handle.data_servicer._build_connection_response = \ + mock_connection_response + + ray = RayAPIStub() + with pytest.raises(RuntimeError): + _ = ray.connect("localhost:50051") + + ray = RayAPIStub() + info3 = ray.connect("localhost:50051", ignore_version=True) + assert info3["num_clients"] == 1, 
info3 + ray.disconnect() + finally: + ray_client_server.shutdown_with_server(server_handle.grpc_server) + time.sleep(2) diff --git a/python/ray/tests/test_job.py b/python/ray/tests/test_job.py index 15313d7bafbd..cc7909dd8cb9 100644 --- a/python/ray/tests/test_job.py +++ b/python/ray/tests/test_job.py @@ -33,7 +33,7 @@ def __init__(self): assert len(actor_table) == 1 job_table = ray.jobs() - assert len(job_table) == 2 # dash + assert len(job_table) == 3 # dash, ray client server # Kill the driver process. p.kill() @@ -79,7 +79,7 @@ def value(self): assert len(actor_table) == 1 job_table = ray.jobs() - assert len(job_table) == 2 # dash + assert len(job_table) == 3 # dash, ray client server # Kill the driver process. p.kill() diff --git a/python/ray/util/client/server/dataservicer.py b/python/ray/util/client/server/dataservicer.py index c9e345219a9b..82ddc85c6f5f 100644 --- a/python/ray/util/client/server/dataservicer.py +++ b/python/ray/util/client/server/dataservicer.py @@ -3,13 +3,12 @@ import grpc import sys -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING from threading import Lock import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc from ray.util.client import CURRENT_PROTOCOL_VERSION -from ray._private.client_mode_hook import disable_client_hook if TYPE_CHECKING: from ray.util.client.server.server import RayletServicer @@ -18,12 +17,10 @@ class DataServicer(ray_client_pb2_grpc.RayletDataStreamerServicer): - def __init__(self, basic_service: "RayletServicer", - ray_connect_handler: Callable): + def __init__(self, basic_service: "RayletServicer"): self.basic_service = basic_service self._clients_lock = Lock() self._num_clients = 0 # guarded by self._clients_lock - self.ray_connect_handler = ray_connect_handler def Datapath(self, request_iterator, context): metadata = {k: v for k, v in context.invocation_metadata()} @@ -34,9 +31,6 @@ def Datapath(self, 
request_iterator, context): logger.info(f"New data connection from client {client_id}") try: with self._clients_lock: - with disable_client_hook(): - if self._num_clients == 0 and not ray.is_initialized(): - self.ray_connect_handler() self._num_clients += 1 for req in request_iterator: resp = None @@ -69,14 +63,9 @@ def Datapath(self, request_iterator, context): finally: logger.info(f"Lost data connection from client {client_id}") self.basic_service.release_all(client_id) - with self._clients_lock: self._num_clients -= 1 - with disable_client_hook(): - if self._num_clients == 0: - ray.shutdown() - def _build_connection_response(self): with self._clients_lock: cur_num_clients = self._num_clients diff --git a/python/ray/util/client/server/server.py b/python/ray/util/client/server/server.py index 6e65c929b8d8..6a7badaf703a 100644 --- a/python/ray/util/client/server/server.py +++ b/python/ray/util/client/server/server.py @@ -422,17 +422,10 @@ def __getattr__(self, attr): return getattr(self.grpc_server, attr) -def serve(connection_str, ray_connect_handler=None): - def default_connect_handler(): - with disable_client_hook(): - if not ray.is_initialized(): - return ray.init() - - ray_connect_handler = ray_connect_handler or default_connect_handler +def serve(connection_str): server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) task_servicer = RayletServicer() - data_servicer = DataServicer( - task_servicer, ray_connect_handler=ray_connect_handler) + data_servicer = DataServicer(task_servicer) logs_servicer = LogstreamServicer() ray_client_pb2_grpc.add_RayletDriverServicer_to_server( task_servicer, server) @@ -455,17 +448,7 @@ def init_and_serve(connection_str, *args, **kwargs): with disable_client_hook(): # Disable client mode inside the worker's environment info = ray.init(*args, **kwargs) - - def ray_connect_handler(): - # Ray client will disconnect from ray when - # num_clients == 0. 
- if ray.is_initialized(): - return info - else: - return ray.init(*args, **kwargs) - - server_handle = serve( - connection_str, ray_connect_handler=ray_connect_handler) + server_handle = serve(connection_str) return (server_handle, info) @@ -475,19 +458,6 @@ def shutdown_with_server(server, _exiting_interpreter=False): ray.shutdown(_exiting_interpreter) -def create_ray_handler(redis_address, redis_password): - def ray_connect_handler(): - if redis_address: - if redis_password: - ray.init(address=redis_address, _redis_password=redis_password) - else: - ray.init(address=redis_address) - else: - ray.init() - - return ray_connect_handler - - def main(): import argparse parser = argparse.ArgumentParser() @@ -507,13 +477,18 @@ def main(): help="Password for connecting to Redis") args = parser.parse_args() logging.basicConfig(level="INFO") - - ray_connect_handler = create_ray_handler(args.redis_address, - args.redis_password) - + if args.redis_address: + if args.redis_password: + ray.init( + address=args.redis_address, + _redis_password=args.redis_password) + else: + ray.init(address=args.redis_address) + else: + ray.init() hostport = "%s:%d" % (args.host, args.port) logger.info(f"Starting Ray Client server on {hostport}") - server = serve(hostport, ray_connect_handler) + server = serve(hostport) try: while True: time.sleep(1000) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index db9a1cc63052..3f04c80a48ca 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -68,7 +68,6 @@ def __init__(self, """ self.metadata = metadata if metadata else [] self.channel = None - self.server = None self._conn_state = grpc.ChannelConnectivity.IDLE self._client_id = make_client_id() self._converted: Dict[str, ClientStub] = {} @@ -84,7 +83,7 @@ def __init__(self, # looking like a gRPC connection, though it may be a proxy. 
conn_attempts = 0 timeout = INITIAL_TIMEOUT_SEC - service_ready = False + ray_ready = False while conn_attempts < max(connection_retries, 1): conn_attempts += 1 try: @@ -95,8 +94,13 @@ def __init__(self, # RayletDriverStub, allowing for unary requests. self.server = ray_client_pb2_grpc.RayletDriverStub( self.channel) - service_ready = bool(self.ping_server()) - if service_ready: + # Now the HTTP2 channel is ready, or proxied, but the + # servicer may not be ready. Call is_initialized() and if + # it throws, the servicer is not ready. On success, the + # `ray_ready` result is checked. + ray_ready = self.is_initialized() + if ray_ready: + # Ray is ready! Break out of the retry loop break # Ray is not ready yet, wait a timeout time.sleep(timeout) @@ -116,10 +120,9 @@ def __init__(self, f"retry in {timeout}s...") timeout = backoff(timeout) - # If we made it through the loop without service_ready - # it means we've used up our retries and - # should error back to the user. - if not service_ready: + # If we made it through the loop without ray_ready it means we've used + # up our retries and should error back to the user. + if not ray_ready: raise ConnectionError("ray client connection timeout") # Initialize the streams to finish protocol negotiation. @@ -374,18 +377,6 @@ def is_initialized(self) -> bool: ray_client_pb2.ClusterInfoType.IS_INITIALIZED) return False - def ping_server(self) -> bool: - """Simple health check. - - Piggybacks the IS_INITIALIZED call to check if the server provides - an actual response. 
- """ - if self.server is not None: - result = self.get_cluster_info( - ray_client_pb2.ClusterInfoType.IS_INITIALIZED) - return result is not None - return False - def is_connected(self) -> bool: return self._conn_state == grpc.ChannelConnectivity.READY From 4df7e99b3b0448b15712df2bc95f53fc7453a867 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 070/244] Revert "[Java] fix actor restart failure when multi-worker is turned on (#13793)" This reverts commit 130802fce0e4662612b23d99825fe217c6de1309. --- .../java/io/ray/test/ActorRestartTest.java | 20 ++-------- src/ray/raylet/node_manager.cc | 2 +- src/ray/raylet/worker_pool.cc | 28 ++----------- src/ray/raylet/worker_pool.h | 9 +---- src/ray/raylet/worker_pool_test.cc | 39 +------------------ 5 files changed, 10 insertions(+), 88 deletions(-) diff --git a/java/test/src/main/java/io/ray/test/ActorRestartTest.java b/java/test/src/main/java/io/ray/test/ActorRestartTest.java index c57f9b6142d1..26326073c634 100644 --- a/java/test/src/main/java/io/ray/test/ActorRestartTest.java +++ b/java/test/src/main/java/io/ray/test/ActorRestartTest.java @@ -3,14 +3,15 @@ import io.ray.api.ActorHandle; import io.ray.api.Ray; import io.ray.runtime.exception.RayActorException; -import io.ray.runtime.exception.RayException; import io.ray.runtime.util.SystemUtil; import java.io.IOException; import java.util.concurrent.TimeUnit; import org.testng.Assert; import org.testng.annotations.Test; -@Test(groups = {"cluster"}) +@Test( + groups = {"cluster"}, + enabled = false) public class ActorRestartTest extends BaseTest { public static class Counter { @@ -57,7 +58,6 @@ public void testActorRestart() throws InterruptedException, IOException { // Kill the actor process. 
killActorProcess(actor); - waitForActorAlive(actor); int value = actor.task(Counter::increase).remote().get(); Assert.assertEquals(value, 1); @@ -83,18 +83,4 @@ private static void killActorProcess(ActorHandle actor) // Wait for the actor to be killed. TimeUnit.SECONDS.sleep(1); } - - private static void waitForActorAlive(ActorHandle actor) { - Assert.assertTrue( - TestUtils.waitForCondition( - () -> { - try { - actor.task(Counter::getPid).remote().get(); - return true; - } catch (RayException e) { - return false; - } - }, - 10000)); - } } diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 9b66d0a7cc82..d0e3be78b23f 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -1267,7 +1267,7 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie } // Remove the dead client from the pool and stop listening for messages. - worker_pool_.DisconnectWorker(worker, disconnect_type); + worker_pool_.DisconnectWorker(worker); // Return the resources that were being used by this worker. 
cluster_task_manager_->ReleaseWorkerResources(worker); diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 89749f2d4b26..ff6083199d0a 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -792,8 +792,7 @@ std::shared_ptr WorkerPool::PopWorker( for (auto it = idle_of_all_languages_.rbegin(); it != idle_of_all_languages_.rend(); it++) { if (task_spec.GetLanguage() != it->first->GetLanguage() || - it->first->GetAssignedJobId() != task_spec.JobId() || - state.pending_disconnection_workers.count(it->first) > 0) { + it->first->GetAssignedJobId() != task_spec.JobId()) { continue; } state.idle.erase(it->first); @@ -858,12 +857,9 @@ void WorkerPool::PrestartWorkers(const TaskSpecification &task_spec, } } -bool WorkerPool::DisconnectWorker(const std::shared_ptr &worker, - rpc::WorkerExitType disconnect_type) { +bool WorkerPool::DisconnectWorker(const std::shared_ptr &worker) { auto &state = GetStateForLanguage(worker->GetLanguage()); RAY_CHECK(RemoveWorker(state.registered_workers, worker)); - RAY_UNUSED(RemoveWorker(state.pending_disconnection_workers, worker)); - for (auto it = idle_of_all_languages_.begin(); it != idle_of_all_languages_.end(); it++) { if (it->first == worker) { @@ -874,25 +870,7 @@ bool WorkerPool::DisconnectWorker(const std::shared_ptr &worker } MarkPortAsFree(worker->AssignedPort()); - auto status = RemoveWorker(state.idle, worker); - if (disconnect_type != rpc::WorkerExitType::INTENDED_EXIT) { - // A Java worker process may have multiple workers. If one of them disconnects - // unintentionally (which means that the worker process has died), we remove the - // others from idle pool so that the failed actor will not be rescheduled on the same - // process. 
- auto pid = worker->GetProcess().GetId(); - for (auto worker2 : state.registered_workers) { - if (worker2->GetProcess().GetId() == pid) { - // NOTE(kfstorm): We have to use a new field to record these workers (instead of - // just removing them from idle sets) because they may haven't announced worker - // port yet. When they announce worker port, they'll be marked idle again. So - // removing them from idle sets here doesn't really prevent them from being popped - // later. - state.pending_disconnection_workers.insert(worker2); - } - } - } - return status; + return RemoveWorker(state.idle, worker); } void WorkerPool::DisconnectDriver(const std::shared_ptr &driver) { diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h index ae7d1c52cddd..703fbf77b781 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -184,11 +184,9 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { /// Disconnect a registered worker. /// - /// \param worker The worker to disconnect. The worker must be registered. - /// \param disconnect_type Type of a worker exit. + /// \param The worker to disconnect. The worker must be registered. /// \return Whether the given worker was in the pool of idle workers. - bool DisconnectWorker(const std::shared_ptr &worker, - rpc::WorkerExitType disconnect_type); + bool DisconnectWorker(const std::shared_ptr &worker); /// Disconnect a registered driver. /// @@ -369,9 +367,6 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { std::unordered_set> registered_workers; /// All drivers that have registered and are still connected. std::unordered_set> registered_drivers; - /// All workers that have registered but is about to disconnect. They shouldn't be - /// popped anymore. - std::unordered_set> pending_disconnection_workers; /// A map from the pids of starting worker processes /// to the number of their unregistered workers. 
std::unordered_map starting_worker_processes; diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc index 044dc33a2ede..0d2c0e314f34 100644 --- a/src/ray/raylet/worker_pool_test.cc +++ b/src/ray/raylet/worker_pool_test.cc @@ -268,8 +268,7 @@ TEST_F(WorkerPoolTest, HandleWorkerRegistration) { // Check that there's no starting worker process ASSERT_EQ(worker_pool_->NumWorkerProcessesStarting(), 0); for (const auto &worker : workers) { - worker_pool_->DisconnectWorker( - worker, /*disconnect_type=*/rpc::WorkerExitType::INTENDED_EXIT); + worker_pool_->DisconnectWorker(worker); // Check that we cannot lookup the worker after it's disconnected. ASSERT_EQ(worker_pool_->GetRegisteredWorker(worker->Connection()), nullptr); } @@ -711,42 +710,6 @@ TEST_F(WorkerPoolTest, DeleteWorkerPushPop) { }); } -TEST_F(WorkerPoolTest, NoPopOnCrashedWorkerProcess) { - // Start a Java worker process. - Process proc = - worker_pool_->StartWorkerProcess(Language::JAVA, rpc::WorkerType::WORKER, JOB_ID); - auto worker1 = CreateWorker(Process(), Language::JAVA); - auto worker2 = CreateWorker(Process(), Language::JAVA); - - // We now imitate worker process crashing while core worker initializing. - - // 1. we register both workers. - RAY_CHECK_OK(worker_pool_->RegisterWorker(worker1, proc.GetId(), [](Status, int) {})); - RAY_CHECK_OK(worker_pool_->RegisterWorker(worker2, proc.GetId(), [](Status, int) {})); - - // 2. announce worker port for worker 1. When interacting with worker pool, it's - // PushWorker. - worker_pool_->PushWorker(worker1); - - // 3. kill the worker process. Now let's assume that Raylet found that the connection - // with worker 1 disconnected first. - worker_pool_->DisconnectWorker( - worker1, /*disconnect_type=*/rpc::WorkerExitType::SYSTEM_ERROR_EXIT); - - // 4. but the RPC for announcing worker port for worker 2 is already in Raylet input - // buffer. So now Raylet needs to handle worker 2. - worker_pool_->PushWorker(worker2); - - // 5. 
Let's try to pop a worker to execute a task. Worker 2 shouldn't be popped because - // the process has crashed. - const auto task_spec = ExampleTaskSpec(); - ASSERT_EQ(worker_pool_->PopWorker(task_spec), nullptr); - - // 6. Now Raylet disconnects with worker 2. - worker_pool_->DisconnectWorker( - worker2, /*disconnect_type=*/rpc::WorkerExitType::SYSTEM_ERROR_EXIT); -} - } // namespace raylet } // namespace ray From bd1a01b3590b0d48a161bea195305e79407d334d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 071/244] Revert "[docs] page for using Modin with Ray (#13937)" This reverts commit 9e2ce013a349ed1a61164ac777e0653aa6fa7f2b. --- doc/source/index.rst | 1 - doc/source/modin/index.rst | 97 -------------------------------------- doc/source/ray-client.rst | 4 +- 3 files changed, 1 insertion(+), 101 deletions(-) delete mode 100644 doc/source/modin/index.rst diff --git a/doc/source/index.rst b/doc/source/index.rst index a37ff8d6b9a8..76bfa3f60a12 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -305,7 +305,6 @@ Papers joblib.rst iter.rst xgboost-ray.rst - modin/index.rst dask-on-ray.rst mars-on-ray.rst ray-client.rst diff --git a/doc/source/modin/index.rst b/doc/source/modin/index.rst deleted file mode 100644 index f7e62fc3f540..000000000000 --- a/doc/source/modin/index.rst +++ /dev/null @@ -1,97 +0,0 @@ -Modin (Pandas on Ray) -===================== - -Modin_, previously Pandas on Ray, is a dataframe manipulation library that -allows users to speed up their pandas workloads by acting as a drop-in -replacement. Modin also provides support for other APIs (e.g. spreadsheet) -and libraries, like xgboost. - -.. code-block:: python - - import modin.pandas as pd - import ray - - ray.init() - df = pd.read_parquet("s3://my-bucket/big.parquet") - -You can use Modin on Ray with your laptop or cluster. 
In this document, -we show instructions for how to set up a Modin compatible Ray cluster -and connect Modin to Ray. - -.. note:: In previous versions of Modin, you had to initialize Ray before importing Modin. As of Modin 0.9.0, This is no longer the case. - -Using Modin with Ray's autoscaler ---------------------------------- - -In order to use Modin with :ref:`Ray's autoscaler `, you need to ensure that the -correct dependencies are installed at startup. Modin's repository has an -example `yaml file and set of tutorial notebooks`_ to ensure that the Ray -cluster has the correct dependencies. Once the cluster is up, connect Modin -by simply importing. - -.. code-block:: python - - import modin.pandas as pd - import ray - - ray.init(address="auto") - df = pd.read_parquet("s3://my-bucket/big.parquet") - -As long as Ray is initialized before any dataframes are created, Modin -will be able to connect to and use the Ray cluster. - -Modin with the Ray Client -------------------------- - -When using Modin with the :ref:`Ray Client `, it is important to ensure that the -cluster has all dependencies installed. - -.. code-block:: python - - import modin.pandas as pd - import ray - import ray.util - - ray.util.connect() - df = pd.read_parquet("s3://my-bucket/big.parquet") - -Modin will automatically use the Ray Client for computation when the file -is read. - -How Modin uses Ray ------------------- - -Modin has a layered architecture, and the core abstraction for data manipulation -is the Modin Dataframe, which implements a novel algebra that enables Modin to -handle all of pandas (see Modin's documentation_ for more on the architecture). -Modin's internal dataframe object has a scheduling layer that is able to partition -and operate on data with Ray. - -Dataframe operations -'''''''''''''''''''' - -The Modin Dataframe uses Ray tasks to perform data manipulations. 
Ray Tasks have -a number of benefits over the actor model for data manipulation: - -- Multiple tasks may be manipulating the same objects simultaneously -- Objects in Ray's object store are immutable, making provenance and lineage easier - to track -- As new workers come online the shuffling of data will happen as tasks are - scheduled on the new node -- Identical partitions need not be replicated, especially beneficial for operations - that selectively mutate the data (e.g. ``fillna``). -- Finer grained parallelism with finer grained placement control - -Machine Learning -'''''''''''''''' - -Modin uses Ray Actors for the machine learning support it currently provides. -Modin's implementation of XGBoost is able to spin up one actor for each node -and aggregate all of the partitions on that node to the XGBoost Actor. Modin -is able to specify precisely the node IP for each actor on creation, giving -fine-grained control over placement - a must for distributed training -performance. - -.. _Modin: https://github.com/modin-project/modin -.. _documentation: https://modin.readthedocs.io/en/latest/developer/architecture.html -.. _yaml file and set of tutorial notebooks: https://github.com/modin-project/modin/tree/master/examples/tutorial/tutorial_notebooks/cluster diff --git a/doc/source/ray-client.rst b/doc/source/ray-client.rst index 487c24696330..a0cd6292a5d9 100644 --- a/doc/source/ray-client.rst +++ b/doc/source/ray-client.rst @@ -1,5 +1,3 @@ -.. _ray-client: - ********** Ray Client ********** @@ -36,7 +34,7 @@ From here, another Ray script can access that server from a networked machine wi do_work.remote(2) #.... - + When the client disconnects, any object or actor references held by the server on behalf of the client are dropped, as if directly disconnecting from the cluster. 
============ From a9a85bee1eb64b8fb59af479071a68646cf8a960 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 072/244] Revert "[dask-on-ray] Fix Dask-on-Ray test: Python 3 dictionary .values() is a view, and is not indexable (#13945)" This reverts commit cb0303ac9b19adc80255e446dc15731e1c661891. --- python/ray/tests/test_dask_scheduler.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/tests/test_dask_scheduler.py b/python/ray/tests/test_dask_scheduler.py index 54ba40521a81..28a98a76eda8 100644 --- a/python/ray/tests/test_dask_scheduler.py +++ b/python/ray/tests/test_dask_scheduler.py @@ -35,9 +35,7 @@ def call_add(): def test_ray_dask_persist(ray_start_regular_shared): arr = da.ones(5) + 2 result = arr.persist(scheduler=ray_dask_get) - np.testing.assert_array_equal( - next(iter(result.dask.values())), - np.ones(5) + 2) + np.testing.assert_array_equal(result.dask.values()[0], np.ones(5) + 2) if __name__ == "__main__": From f35fa0250b7bd3330e11e95ad0155cb15646d268 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 073/244] Revert "[Hotfix] Master compilation error on MacOS. (#13946)" This reverts commit 861d5906b274bdafe76319501602cfbf11f30a77. 
--- src/ray/core_worker/reference_count.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index db05320a9c8b..a38a98d801ed 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -1010,7 +1010,7 @@ void ReferenceCounter::PushToLocationSubscribers(ReferenceTable::iterator it) { const auto callbacks = it->second.location_subscription_callbacks; it->second.location_subscription_callbacks.clear(); it->second.location_version++; - for (const auto &callback : callbacks) { + for (const auto callback : callbacks) { callback(it->second.locations, it->second.object_size, it->second.location_version); } } From 7bfd0768a36bff77f081d586087dc8f287bc155f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 074/244] Revert "[tune] Fixed wait_for_gpu to handle str representations of ordinal IDs (#13936)" This reverts commit 35e20cc5c487006b25f211e033cfcf00acd1d13d. 
--- python/ray/tune/tests/test_trainable_util.py | 107 +------------------ python/ray/tune/utils/util.py | 74 +++++-------- python/ray/tune/utils/util_test.py | 43 ++++++++ 3 files changed, 76 insertions(+), 148 deletions(-) create mode 100644 python/ray/tune/utils/util_test.py diff --git a/python/ray/tune/tests/test_trainable_util.py b/python/ray/tune/tests/test_trainable_util.py index 23dfb35733e7..25860eb1c569 100644 --- a/python/ray/tune/tests/test_trainable_util.py +++ b/python/ray/tune/tests/test_trainable_util.py @@ -1,14 +1,10 @@ -from collections import OrderedDict import os -import sys +import pickle import shutil import unittest -from unittest.mock import patch import ray.utils -import ray.cloudpickle as cloudpickle -from ray.tune.utils.util import wait_for_gpu -from ray.tune.utils.util import unflatten_dict + from ray.tune.utils.trainable import TrainableUtil @@ -16,15 +12,13 @@ class TrainableUtilTest(unittest.TestCase): def setUp(self): self.checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(), "tune", "MyTrainable123") - self.checkpoint_dir = TrainableUtil.make_checkpoint_dir( - self.checkpoint_dir, "0") + TrainableUtil.make_checkpoint_dir(self.checkpoint_dir) def tearDown(self): self.addCleanup(shutil.rmtree, self.checkpoint_dir) def testFindCheckpointDir(self): - checkpoint_path = os.path.join(self.checkpoint_dir, - "0/my/nested/chkpt") + checkpoint_path = os.path.join(self.checkpoint_dir, "my/nested/chkpt") os.makedirs(checkpoint_path) found_dir = TrainableUtil.find_checkpoint_dir(checkpoint_path) self.assertEquals(self.checkpoint_dir, found_dir) @@ -42,7 +36,7 @@ def testPickleCheckpoint(self): checkpoint_path = os.path.join(self.checkpoint_dir, "0") data_dict = TrainableUtil.pickle_checkpoint(checkpoint_path) - loaded = cloudpickle.loads(data_dict) + loaded = pickle.loads(data_dict) checkpoint_name = os.path.basename(checkpoint_path) self.assertEqual(loaded["checkpoint_name"], checkpoint_name) @@ -50,94 +44,3 @@ def 
testPickleCheckpoint(self): for i in range(5): path = os.path.join(self.checkpoint_dir, str(i)) self.assertEquals(loaded["data"][str(i)], open(path, "rb").read()) - - -class UnflattenDictTest(unittest.TestCase): - def test_output_type(self): - in_ = OrderedDict({"a/b": 1, "c/d": 2, "e": 3}) - out = unflatten_dict(in_) - assert type(in_) is type(out) - - def test_one_level_nested(self): - result = unflatten_dict({"a/b": 1, "c/d": 2, "e": 3}) - assert result == {"a": {"b": 1}, "c": {"d": 2}, "e": 3} - - def test_multi_level_nested(self): - result = unflatten_dict({"a/b/c/d": 1, "b/c/d": 2, "c/d": 3, "e": 4}) - assert result == { - "a": { - "b": { - "c": { - "d": 1, - }, - }, - }, - "b": { - "c": { - "d": 2, - }, - }, - "c": { - "d": 3, - }, - "e": 4, - } - - -class GPUUtilMock: - class GPU: - def __init__(self, id, uuid, util=None): - self.id = id - self.uuid = uuid - self.util = [0.5, 0.0] - - @property - def memoryUtil(self): - if self.util: - return self.util.pop(0) - return 0 - - def __init__(self, gpus, gpu_uuids): - self.gpus = gpus - self.uuids = gpu_uuids - self.gpu_list = [ - self.GPU(gpu, uuid) for gpu, uuid in zip(self.gpus, self.uuids) - ] - - def getGPUs(self): - return self.gpu_list - - -class GPUTest(unittest.TestCase): - def setUp(self): - sys.modules["GPUtil"] = GPUUtilMock([0, 1], ["GPU-aaa", "GPU-bbb"]) - - def testGPUWait1(self): - wait_for_gpu(0, delay_s=0) - - def testGPUWait2(self): - wait_for_gpu("1", delay_s=0) - - def testGPUWait3(self): - wait_for_gpu("GPU-aaa", delay_s=0) - - def testGPUWaitFail(self): - with self.assertRaises(ValueError): - wait_for_gpu(2, delay_s=0) - - with self.assertRaises(ValueError): - wait_for_gpu("4", delay_s=0) - - with self.assertRaises(ValueError): - wait_for_gpu(1.23, delay_s=0) - - @patch("ray.get_gpu_ids", lambda: ["0"]) - def testDefaultGPU(self): - import sys - sys.modules["GPUtil"] = GPUUtilMock([0], ["GPU-aaa"]) - wait_for_gpu(delay_s=0) - - -if __name__ == "__main__": - import pytest - 
sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/utils/util.py b/python/ray/tune/utils/util.py index 73c56a013279..02daa858fd75 100644 --- a/python/ray/tune/utils/util.py +++ b/python/ray/tune/utils/util.py @@ -21,14 +21,10 @@ logger = logging.getLogger(__name__) - -def _import_gputil(): - try: - import GPUtil - except ImportError: - GPUtil = None - return GPUtil - +try: + import GPUtil +except ImportError: + GPUtil = None _pinned_objects = [] PINNED_OBJECT_PREFIX = "ray.tune.PinnedObject:" @@ -47,8 +43,6 @@ class UtilMonitor(Thread): def __init__(self, start=True, delay=0.7): self.stopped = True - GPUtil = _import_gputil() - self.GPUtil = GPUtil if GPUtil is None and start: logger.warning("Install gputil for GPU system monitoring.") @@ -73,10 +67,10 @@ def _read_utilization(self): float(psutil.cpu_percent(interval=None))) self.values["ram_util_percent"].append( float(getattr(psutil.virtual_memory(), "percent"))) - if self.GPUtil is not None: + if GPUtil is not None: gpu_list = [] try: - gpu_list = self.GPUtil.getGPUs() + gpu_list = GPUtil.getGPUs() except Exception: logger.debug("GPUtil failed to retrieve GPUs.") for gpu in gpu_list: @@ -471,7 +465,6 @@ def load_newest_checkpoint(dirpath: str, ckpt_pattern: str) -> dict: def wait_for_gpu(gpu_id=None, target_util=0.01, retry=20, - delay_s=5, gpu_memory_limit=None): """Checks if a given GPU has freed memory. @@ -483,9 +476,8 @@ def wait_for_gpu(gpu_id=None, the first item returned from `ray.get_gpu_ids()`. target_util (float): The utilization threshold to reach to unblock. Set this to 0 to block until the GPU is completely free. - retry (int): Number of times to check GPU limit. Sleeps `delay_s` + retry (int): Number of times to check GPU limit. Sleeps 5 seconds between checks. - delay_s (int): Seconds to wait before check. gpu_memory_limit (float): Deprecated. 
Returns: @@ -505,54 +497,44 @@ def tune_func(config): tune.run(tune_func, resources_per_trial={"GPU": 1}, num_samples=10) """ - GPUtil = _import_gputil() if gpu_memory_limit: raise ValueError("'gpu_memory_limit' is deprecated. " "Use 'target_util' instead.") if GPUtil is None: raise RuntimeError( "GPUtil must be installed if calling `wait_for_gpu`.") - if gpu_id is None: gpu_id_list = ray.get_gpu_ids() if not gpu_id_list: - raise RuntimeError("No GPU ids found from `ray.get_gpu_ids()`. " + raise RuntimeError(f"No GPU ids found from {ray.get_gpu_ids()}. " "Did you set Tune resources correctly?") gpu_id = gpu_id_list[0] - gpu_attr = "id" - if isinstance(gpu_id, str): - if gpu_id.isdigit(): - # GPU ID returned from `ray.get_gpu_ids()` is a str representation - # of the int GPU ID - gpu_id = int(gpu_id) - else: - # Could not coerce gpu_id to int, so assume UUID - # and compare against `uuid` attribute e.g., - # 'GPU-04546190-b68d-65ac-101b-035f8faed77d' - gpu_attr = "uuid" - elif not isinstance(gpu_id, int): - raise ValueError(f"gpu_id ({type(gpu_id)}) must be type str/int.") - - def gpu_id_fn(g): - # Returns either `g.id` or `g.uuid` depending on - # the format of the input `gpu_id` - return getattr(g, gpu_attr) - - gpu_ids = {gpu_id_fn(g) for g in GPUtil.getGPUs()} - if gpu_id not in gpu_ids: - raise ValueError( - f"{gpu_id} not found in set of available GPUs: {gpu_ids}. " - "`wait_for_gpu` takes either GPU ordinal ID (e.g., '0') or " - "UUID (e.g., 'GPU-04546190-b68d-65ac-101b-035f8faed77d').") + if isinstance(gpu_id, int): + list_gpu_ids = [g.id for g in GPUtil.getGPUs()] + if gpu_id not in list_gpu_ids: + raise ValueError( + f"{gpu_id} (int) not found in GPU ids: {list_gpu_ids}. " + "wait_for_gpu takes either int (gpu id) or str (gpu uuid).") + elif isinstance(gpu_id, str): + list_uuids = [g.uuid for g in GPUtil.getGPUs()] + if gpu_id not in list_uuids: + raise ValueError( + f"{gpu_id} (str) not found in GPU uuids: {list_uuids}. 
" + "wait_for_gpu takes either int (gpu id) or str (gpu uuid).") + else: + raise ValueError(f"gpu_id must be int or str -- got ({type(gpu_id)})") for i in range(int(retry)): - gpu_object = next( - g for g in GPUtil.getGPUs() if gpu_id_fn(g) == gpu_id) + if isinstance(gpu_id, int): + gpu_object = [g for g in GPUtil.getGPUs() if g.id == gpu_id][0] + else: + gpu_object = [g for g in GPUtil.getGPUs() if g.uuid == gpu_id][0] + if gpu_object.memoryUtil > target_util: logger.info(f"Waiting for GPU util to reach {target_util}. " f"Util: {gpu_object.memoryUtil:0.3f}") - time.sleep(delay_s) + time.sleep(5) else: return True raise RuntimeError("GPU memory was not freed.") diff --git a/python/ray/tune/utils/util_test.py b/python/ray/tune/utils/util_test.py new file mode 100644 index 000000000000..534061f686d0 --- /dev/null +++ b/python/ray/tune/utils/util_test.py @@ -0,0 +1,43 @@ +from collections import OrderedDict + +import unittest + +from .util import unflatten_dict + + +class UnflattenDictTest(unittest.TestCase): + def test_output_type(self): + in_ = OrderedDict({"a/b": 1, "c/d": 2, "e": 3}) + out = unflatten_dict(in_) + assert type(in_) is type(out) + + def test_one_level_nested(self): + result = unflatten_dict({"a/b": 1, "c/d": 2, "e": 3}) + assert result == {"a": {"b": 1}, "c": {"d": 2}, "e": 3} + + def test_multi_level_nested(self): + result = unflatten_dict({"a/b/c/d": 1, "b/c/d": 2, "c/d": 3, "e": 4}) + assert result == { + "a": { + "b": { + "c": { + "d": 1, + }, + }, + }, + "b": { + "c": { + "d": 2, + }, + }, + "c": { + "d": 3, + }, + "e": 4, + } + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) From 923a8875c233aa1b8b7175a221c9bed083df78f4 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 075/244] Revert "Fix test_actor_restart (#13901)" This reverts commit 4fc420bd289c7be2cb683ae625778ee27d725401. 
--- python/ray/tests/test_actor_failures.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index f26f87a0c101..4e2e19f1bfd0 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -115,8 +115,6 @@ def get_pid(self): ray.get(results[0]) except ray.exceptions.RayActorError: results.pop(0) - else: - break # Check all tasks that executed after the restart. if results: # The actor executed some tasks after the restart. From a6917935f53ac9672136706f877ae84a898f58e9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 076/244] Revert "Buildkite determine-to-run support (#13866)" This reverts commit 22e03e3c98482668fdf9b0aa30f07b66e955339e. --- .buildkite/pipeline.yml | 121 +++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 69 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 73e715cde885..00931f9ddd54 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,155 +1,142 @@ - label: ":book: Lint" commands: - - export LINT=1 - - ./ci/travis/install-dependencies.sh - - ./ci/travis/ci.sh lint - - ./ci/travis/ci.sh build + - export LINT=1 + - ./ci/travis/install-dependencies.sh + - ./ci/travis/ci.sh lint + - ./ci/travis/ci.sh build - label: ":java: Java" - conditions: ["RAY_CI_JAVA_AFFECTED"] commands: - - apt-get install -y openjdk-8-jdk maven clang-format - # Compile Java again so bazel will compile Java as a language. - - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build - - ./java/test.sh + - apt-get install -y openjdk-8-jdk maven clang-format + # Compile Java again so bazel will compile Java as a language. 
+ - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build + - ./java/test.sh - label: ":java: Streaming" - conditions: - ["RAY_CI_STREAMING_PYTHON_AFFECTED", "RAY_CI_STREAMING_JAVA_AFFECTED"] commands: - - apt-get install -y openjdk-8-jdk maven - # Compile Java again so bazel will compile Java as a language. - - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build - - bazel test --config=ci $(./scripts/bazel_export_options) + - apt-get install -y openjdk-8-jdk maven + # Compile Java again so bazel will compile Java as a language. + - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build + - bazel test --config=ci $(./scripts/bazel_export_options) //streaming:all - - bash streaming/src/test/run_streaming_queue_test.sh + - bash streaming/src/test/run_streaming_queue_test.sh - label: ":cpp: Worker" commands: - - ./ci/travis/ci.sh test_cpp + - ./ci/travis/ci.sh test_cpp - label: ":cpp: Tests" commands: - - bazel test --config=ci $(./scripts/bazel_export_options) + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -core_worker_test - label: ":cpp: Tests (ASAN)" commands: - - bazel test --config=ci --config=asan $(./scripts/bazel_export_options) + - bazel test --config=ci --config=asan $(./scripts/bazel_export_options) --build_tests_only --config=asan-buildkite --jobs=2 -- //:all -//:core_worker_test - label: ":serverless: Dashboard + Serve Tests" - conditions: - [ - "RAY_CI_SERVE_AFFECTED", - "RAY_CI_DASHBOARD_AFFECTED", - "RAY_CI_PYTHON_AFFECTED", - ] - commands: - - TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) + commands: + - TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) python/ray/new_dashboard/... - - bazel test --config=ci $(./scripts/bazel_export_options) + - bazel test --config=ci $(./scripts/bazel_export_options) python/ray/serve/... 
- label: ":python: (Small & Large)" - conditions: ["RAY_CI_PYTHON_AFFECTED"] commands: - - bazel test --config=ci $(./scripts/bazel_export_options) + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-kubernetes,-jenkins_only,-medium_size_python_tests_a_to_j,-medium_size_python_tests_k_to_z python/ray/tests/... - - bazel test --config=ci $(./scripts/bazel_export_options) + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-kubernetes,-jenkins_only,client_tests --test_env=RAY_CLIENT_MODE=1 python/ray/tests/... - label: ":python: (Medium A-J)" - conditions: ["RAY_CI_PYTHON_AFFECTED"] commands: - - bazel test --config=ci $(./scripts/bazel_export_options) + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_a_to_j python/ray/tests/... - label: ":python: (Medium K-Z)" - conditions: ["RAY_CI_PYTHON_AFFECTED"] commands: - - bazel test --config=ci $(./scripts/bazel_export_options) + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_k_to_z python/ray/tests/... - label: ":brain: RLlib: Learning tests (from rllib/tuned_examples/*.yaml)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=learning_tests_tf rllib/... 
+ - label: ":brain: RLlib: Learning tests with tf=1.x (from rllib/tuned_examples/*.yaml)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] commands: - RLLIB_TESTING=1 TF_VERSION=1.14.0 TFP_VERSION=0.7 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=learning_tests_tf rllib/... + - label: ":brain: RLlib: Learning tests with Torch (from rllib/tuned_examples/*.yaml)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] commands: - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=learning_tests_torch rllib/... + - label: ":brain: RLlib: Quick Agent train.py runs" - conditions: ["RAY_CI_RLLIB_AFFECTED"] commands: - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=quick_train - --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... + --build_tests_only + --test_tag_filters=quick_train + --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... # Test everything that does not have any of the "main" labels: # "learning_tests|quick_train|examples|tests_dir". - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir - --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... + --build_tests_only + --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir + --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... 
+ - label: ":brain: RLlib: rllib/examples/" - conditions: ["RAY_CI_RLLIB_AFFECTED"] commands: - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... + --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... + --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... 
+ - label: ":brain: RLlib: rllib/tests/ (A-L)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] commands: - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... + --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + - label: ":brain: RLlib: rllib/tests/ (M-Z)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] commands: - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... + --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + - label: ":octopus: Tune tests and examples" - conditions: ["RAY_CI_TUNE_AFFECTED"] commands: - TUNE_TESTING=1 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only,-example python/ray/tune/... @@ -159,14 +146,12 @@ - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-py37,flaky python/ray/tune/... 
- label: ":octopus: SGD tests and examples" - conditions: ["RAY_CI_SGD_AFFECTED"] commands: - SGD_TESTING=1 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/... - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/... - label: ":octopus: Tune/SGD tests and examples. Python 3.7" - conditions: ["RAY_CI_TUNE_AFFECTED", "RAY_CI_SGD_AFFECTED"] commands: - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh # Bcause Python version changed, we need to re-install Ray here @@ -175,10 +160,8 @@ - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... - label: ":book: Doc tests and examples" - conditions: - ["RAY_CI_PYTHON_AFFECTED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_DOC_AFFECTED"] commands: - DOC_TESTING=1 ./ci/travis/install-dependencies.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 doc/... - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 doc/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 doc/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 doc/... \ No newline at end of file From e237771b1d080c815639397d5f5e6d9e9ae7c5f0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 077/244] Revert "[Tune] Add try-except to FailureInjectorCallback (#13939)" This reverts commit 897e4ceccbfded47bdbf2b874bab7a66542b1eeb. 
--- python/ray/tune/utils/mock.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/python/ray/tune/utils/mock.py b/python/ray/tune/utils/mock.py index eea7b194d9ea..cc92fae26dee 100644 --- a/python/ray/tune/utils/mock.py +++ b/python/ray/tune/utils/mock.py @@ -1,6 +1,4 @@ -import logging import os - import numpy as np import json import random @@ -20,8 +18,6 @@ LOCAL_SYNC_TEMPLATE = "mkdir -p {target} && rsync -avz {source}/ {target}/" LOCAL_DELETE_TEMPLATE = "rm -rf {target}" -logger = logging.getLogger(__name__) - def mock_storage_client(): """Mocks storage client that treats a local dir as durable storage.""" @@ -114,25 +110,13 @@ def __init__(self, self.disable = disable def on_step_begin(self, **info): - import click from ray.autoscaler._private.commands import kill_node - failures = 0 - max_failures = 3 # With 10% probability inject failure to a worker. if random.random() < self.probability and not self.disable: # With 10% probability fully terminate the node. should_terminate = random.random() < self.probability - while failures < max_failures: - try: - kill_node( - self.config_path, - yes=True, - hard=should_terminate, - override_cluster_name=None) - except click.exceptions.ClickException: - failures += 1 - logger.exception("Killing random node failed in attempt " - "{}. " - "Retrying {} more times".format( - str(failures), - str(max_failures - failures))) + kill_node( + self.config_path, + yes=True, + hard=should_terminate, + override_cluster_name=None) From 5993b965f62a5007cc06508307f4fa8cb4855db4 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 078/244] Revert "Ray client version check strict eq (#13926)" This reverts commit 4edbb2343181952d52d624724fc048a767f3f39d. 
--- python/ray/util/client/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/util/client/__init__.py b/python/ray/util/client/__init__.py index 94a664a80e33..3fdcd4f8810c 100644 --- a/python/ray/util/client/__init__.py +++ b/python/ray/util/client/__init__.py @@ -85,8 +85,8 @@ def _check_versions(self, conn_info: Dict[str, Any], logger.warning(msg) else: raise RuntimeError(msg) - if CURRENT_PROTOCOL_VERSION != conn_info["protocol_version"]: - msg = "Client Ray installation incompatible with server:" + \ + if CURRENT_PROTOCOL_VERSION < conn_info["protocol_version"]: + msg = "Client Ray installation out of date:" + \ f" client is {CURRENT_PROTOCOL_VERSION}," + \ f" server is {conn_info['protocol_version']}" if ignore_version: From 8b49b4df210dc44a45459b74a6028fbce2e82667 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 079/244] Revert "Revert "Fix passing env on windows (#13253)" (#13828)" This reverts commit e1d0f41916f8896681ef135ac619e036b082e70e. --- src/ray/util/process.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ray/util/process.cc b/src/ray/util/process.cc index 0928c4402a72..a9008df32e6c 100644 --- a/src/ray/util/process.cc +++ b/src/ray/util/process.cc @@ -139,6 +139,15 @@ class ProcessFD { STARTUPINFO si = {sizeof(si)}; RAY_UNUSED( new_env_block.c_str()); // Ensure there's a final terminator for Windows + // MSDN: + // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-createprocessa + // Note that an ANSI environment block is terminated by two zero bytes: + // one for the last string, one more to terminate the block. + // A Unicode environment block is terminated by four zero bytes: + // two for the last string, two more to terminate the block. 
+ if (!new_env_block.empty()) { + new_env_block += '\0'; + } char *const envp = &new_env_block[0]; if (CreateProcessA(NULL, cmdline, NULL, NULL, FALSE, 0, envp, NULL, &si, &pi)) { succeeded = true; From ca0b6b7885dd02b667833d8c3d9a663b6c49225b Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 080/244] Revert "[GCS]Fix bug that gcs client does not set last_resource_usage_ (#13856)" This reverts commit ca15a730ac335e15261dc3cbe9f4298f9365fd6a. --- src/ray/gcs/accessor.h | 3 ++- src/ray/gcs/gcs_client/service_based_accessor.cc | 6 ------ .../gcs/gcs_client/test/service_based_gcs_client_test.cc | 8 -------- src/ray/raylet/node_manager.task.cc | 2 ++ src/ray/raylet/scheduling/cluster_resource_scheduler.h | 2 +- .../scheduling/cluster_resource_scheduler_interface.h | 2 +- .../raylet/scheduling/old_cluster_resource_scheduler.cc | 4 ++++ .../raylet/scheduling/old_cluster_resource_scheduler.h | 4 ++-- 8 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index be929ec3ff0d..034e91082bc5 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -565,7 +565,7 @@ class NodeResourceInfoAccessor { virtual void AsyncReReportResourceUsage() = 0; /// Return resources in last report. Used by light heartbeat. - const std::shared_ptr &GetLastResourceUsage() { + std::shared_ptr &GetLastResourceUsage() { return last_resource_usage_; } @@ -589,6 +589,7 @@ class NodeResourceInfoAccessor { protected: NodeResourceInfoAccessor() = default; + private: /// Cache which stores resource usage in last report used to check if they are changed. /// Used by light resource usage report. 
std::shared_ptr last_resource_usage_ = diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index a82e0ab6bcdd..015da29f3e0f 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -707,12 +707,6 @@ Status ServiceBasedNodeResourceInfoAccessor::AsyncUpdateResources( Status ServiceBasedNodeResourceInfoAccessor::AsyncReportResourceUsage( const std::shared_ptr &data_ptr, const StatusCallback &callback) { absl::MutexLock lock(&mutex_); - last_resource_usage_->SetAvailableResources( - ResourceSet(MapFromProtobuf(data_ptr->resources_available()))); - last_resource_usage_->SetTotalResources( - ResourceSet(MapFromProtobuf(data_ptr->resources_total()))); - last_resource_usage_->SetLoadResources( - ResourceSet(MapFromProtobuf(data_ptr->resource_load()))); cached_resource_usage_.mutable_resources()->CopyFrom(*data_ptr); client_impl_->GetGcsRpcClient().ReportResourceUsage( cached_resource_usage_, diff --git a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc index 191ffa0fff0f..3b1a6a69ad7a 100644 --- a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc +++ b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc @@ -715,16 +715,8 @@ TEST_F(ServiceBasedGcsClientTest, TestNodeResourceUsage) { auto resource = std::make_shared(); resource->set_node_id(node_id.Binary()); resource->set_should_global_gc(true); - std::string resource_name = "CPU"; - double resource_value = 1.0; - (*resource->mutable_resources_total())[resource_name] = resource_value; ASSERT_TRUE(ReportResourceUsage(resource)); WaitForExpectedCount(resource_batch_count, 1); - - // Get and check last report resource usage. 
- auto last_resource_usage = gcs_client_->NodeResources().GetLastResourceUsage(); - ASSERT_EQ(last_resource_usage->GetTotalResources().GetResource(resource_name), - resource_value); } TEST_F(ServiceBasedGcsClientTest, TestNodeResourceUsageWithLightResourceUsageReport) { diff --git a/src/ray/raylet/node_manager.task.cc b/src/ray/raylet/node_manager.task.cc index 2fec7360b354..150ecb02d2ba 100644 --- a/src/ray/raylet/node_manager.task.cc +++ b/src/ray/raylet/node_manager.task.cc @@ -116,6 +116,8 @@ void NodeManager::FillResourceUsage(std::shared_ptr resource (*resources_data->mutable_resource_load())[resource_pair.first] = resource_pair.second; } + last_heartbeat_resources->SetLoadResources( + ResourceSet(local_resources.GetLoadResources())); } // Add resource load by shape. This will be used by the new autoscaler. diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.h b/src/ray/raylet/scheduling/cluster_resource_scheduler.h index 892db9e8b6a3..747fe6f6fba2 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.h +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.h @@ -387,7 +387,7 @@ class ClusterResourceScheduler : public ClusterResourceSchedulerInterface { /// /// \param gcs_resources: The remote cache from gcs. void UpdateLastResourceUsage( - const std::shared_ptr gcs_resources) override; + std::shared_ptr gcs_resources) override; /// Return human-readable string for this scheduler state. std::string DebugString() const; diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h b/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h index 21c6b6edccd3..ca2ba5237d71 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h @@ -54,7 +54,7 @@ class ClusterResourceSchedulerInterface { /// /// \param gcs_resources: The remote cache from gcs. 
virtual void UpdateLastResourceUsage( - const std::shared_ptr gcs_resources) {} + std::shared_ptr gcs_resources) {} /// Populate the relevant parts of the heartbeat table. This is intended for /// sending raylet <-> gcs heartbeats. In particular, this should fill in diff --git a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc index 9801e57c6311..9d5c5a9e95e9 100644 --- a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc +++ b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc @@ -71,6 +71,8 @@ void OldClusterResourceScheduler::FillResourceUsage( (*resources_data->mutable_resources_total())[resource_pair.first] = resource_pair.second; } + last_heartbeat_resources_->SetTotalResources( + ResourceSet(local_resources.GetTotalResources())); } if (!last_heartbeat_resources_->GetAvailableResources().IsEqual( @@ -81,6 +83,8 @@ void OldClusterResourceScheduler::FillResourceUsage( (*resources_data->mutable_resources_available())[resource_pair.first] = resource_pair.second; } + last_heartbeat_resources_->SetAvailableResources( + ResourceSet(local_resources.GetAvailableResources())); } } diff --git a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h index 927442c6c078..288a85c1c37a 100644 --- a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h +++ b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h @@ -23,7 +23,7 @@ class OldClusterResourceScheduler : public ClusterResourceSchedulerInterface { explicit OldClusterResourceScheduler( const NodeID &self_node_id, ResourceIdSet &local_available_resources, std::unordered_map &cluster_resource_map, - const std::shared_ptr last_heartbeat_resources); + std::shared_ptr last_heartbeat_resources); /// Remove node from the cluster data structure. This happens /// when a node fails or it is removed from the cluster. 
@@ -67,6 +67,6 @@ class OldClusterResourceScheduler : public ClusterResourceSchedulerInterface { std::string self_node_id_string_; ResourceIdSet &local_available_resources_; std::unordered_map &cluster_resource_map_; - const std::shared_ptr last_heartbeat_resources_; + std::shared_ptr last_heartbeat_resources_; }; } // namespace ray From 042d3a8ff6c9fbe4702f947c81621aae17652614 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 081/244] Revert "[Placement Group] Support named placement group (#13755)" This reverts commit 94297849a1cd78166a64c058e6cefcd82733dde6. --- doc/source/placement-group.rst | 35 -------- python/ray/includes/global_state_accessor.pxd | 2 - python/ray/includes/global_state_accessor.pxi | 10 --- python/ray/state.py | 14 --- python/ray/tests/test_placement_group.py | 86 +------------------ python/ray/util/__init__.py | 4 +- python/ray/util/placement_group.py | 26 +----- src/ray/gcs/accessor.h | 10 +-- .../gcs/gcs_client/global_state_accessor.cc | 12 --- .../gcs/gcs_client/global_state_accessor.h | 13 +-- .../gcs/gcs_client/service_based_accessor.cc | 20 ----- .../gcs/gcs_client/service_based_accessor.h | 4 - .../gcs_server/gcs_placement_group_manager.cc | 66 ++------------ .../gcs_server/gcs_placement_group_manager.h | 13 +-- .../test/gcs_placement_group_manager_test.cc | 25 ------ src/ray/protobuf/gcs_service.proto | 14 --- src/ray/rpc/gcs_server/gcs_rpc_client.h | 4 - src/ray/rpc/gcs_server/gcs_rpc_server.h | 5 -- 18 files changed, 17 insertions(+), 346 deletions(-) diff --git a/doc/source/placement-group.rst b/doc/source/placement-group.rst index 7db38fd84512..1424b850c9c8 100644 --- a/doc/source/placement-group.rst +++ b/doc/source/placement-group.rst @@ -252,41 +252,6 @@ Note that you can anytime remove the placement group to clean up resources. 
ray.shutdown() -Named Placement Groups ----------------------- - -A placement group can be given a globally unique name. -This allows you to retrieve the placement group from any job in the Ray cluster. -This can be useful if you cannot directly pass the placement group handle to -the actor or task that needs it, or if you are trying to -access a placement group launched by another driver. -Note that the placement group will still be destroyed if it's lifetime isn't `detached`. -See :ref:`placement-group-lifetimes` for more details. - -.. tabs:: - .. group-tab:: Python - - .. code-block:: python - - # first_driver.py - # Create a placement group with a global name. - pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="STRICT_SPREAD", lifetime="detached", name="global_name") - ray.get(pg.ready()) - - Then, we can retrieve the actor later somewhere. - - .. code-block:: python - - # second_driver.py - # Retrieve a placement group with a global name. - pg = ray.util.get_placement_group("global_name") - - .. group-tab:: Java - - The named placement group is not implemented for Java APIs yet. - -.. 
_placement-group-lifetimes: - Placement Group Lifetimes ------------------------- diff --git a/python/ray/includes/global_state_accessor.pxd b/python/ray/includes/global_state_accessor.pxd index e27aa0547d2a..31418f10c0af 100644 --- a/python/ray/includes/global_state_accessor.pxd +++ b/python/ray/includes/global_state_accessor.pxd @@ -32,6 +32,4 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil: c_bool AddWorkerInfo(const c_string &serialized_string) unique_ptr[c_string] GetPlacementGroupInfo( const CPlacementGroupID &placement_group_id) - unique_ptr[c_string] GetPlacementGroupByName( - const c_string &placement_group_name) c_vector[c_string] GetAllPlacementGroupInfo() diff --git a/python/ray/includes/global_state_accessor.pxi b/python/ray/includes/global_state_accessor.pxi index 5690d3bab65e..cbb1bac0aed9 100644 --- a/python/ray/includes/global_state_accessor.pxi +++ b/python/ray/includes/global_state_accessor.pxi @@ -147,13 +147,3 @@ cdef class GlobalStateAccessor: if result: return c_string(result.get().data(), result.get().size()) return None - - def get_placement_group_by_name(self, placement_group_name): - cdef unique_ptr[c_string] result - cdef c_string cplacement_group_name = placement_group_name - with nogil: - result = self.inner.get().GetPlacementGroupByName( - cplacement_group_name) - if result: - return c_string(result.get().data(), result.get().size()) - return None diff --git a/python/ray/state.py b/python/ray/state.py index 7524ea1244b2..aa3488e20e78 100644 --- a/python/ray/state.py +++ b/python/ray/state.py @@ -388,20 +388,6 @@ def profile_table(self): return dict(result) - def get_placement_group_by_name(self, placement_group_name): - self._check_connected() - - placement_group_info = ( - self.global_state_accessor.get_placement_group_by_name( - placement_group_name)) - if placement_group_info is None: - return None - else: - placement_group_table_data = \ - gcs_utils.PlacementGroupTableData.FromString( - placement_group_info) 
- return self._gen_placement_group_info(placement_group_table_data) - def placement_group_table(self, placement_group_id=None): self._check_connected() diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 024ff6c5557a..87273a4998c9 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -375,7 +375,6 @@ def test_remove_pending_placement_group(ray_start_cluster): # Create a placement group that cannot be scheduled now. placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}]) ray.util.remove_placement_group(placement_group) - # TODO(sang): Add state check here. @ray.remote(num_cpus=4) def f(): @@ -798,10 +797,10 @@ def random_tasks(): pg_tasks = [] # total bundle gpu usage = bundles_per_pg * total_num_pg * per_bundle_gpus # Note this is half of total - for index in range(total_num_pg): + for _ in range(total_num_pg): pgs.append( ray.util.placement_group( - name=f"name{index}", + name="name", strategy="PACK", bundles=[{ "GPU": per_bundle_gpus @@ -1424,86 +1423,5 @@ def schedule_nested_actor_with_detached_pg(self): assert assert_alive_num_actor(4) -def test_named_placement_group(ray_start_cluster): - cluster = ray_start_cluster - for _ in range(2): - cluster.add_node(num_cpus=3) - cluster.wait_for_nodes() - info = ray.init(address=cluster.address) - global_placement_group_name = "named_placement_group" - - # Create a detached placement group with name. - driver_code = f""" -import ray - -ray.init(address="{info["redis_address"]}") - -pg = ray.util.placement_group( - [{{"CPU": 1}} for _ in range(2)], - strategy="STRICT_SPREAD", - name="{global_placement_group_name}", - lifetime="detached") -ray.get(pg.ready()) - -ray.shutdown() - """ - - run_string_as_driver(driver_code) - - # Wait until the driver is reported as dead by GCS. 
- def is_job_done(): - jobs = ray.jobs() - for job in jobs: - if "StopTime" in job: - return True - return False - - wait_for_condition(is_job_done) - - @ray.remote(num_cpus=1) - class Actor: - def ping(self): - return "pong" - - # Get the named placement group and schedule a actor. - placement_group = ray.util.get_placement_group(global_placement_group_name) - assert placement_group is not None - assert placement_group.wait(5) - actor = Actor.options( - placement_group=placement_group, - placement_group_bundle_index=0).remote() - - ray.get(actor.ping.remote()) - - # Create another placement group and make sure its creation will failed. - same_name_pg = ray.util.placement_group( - [{ - "CPU": 1 - } for _ in range(2)], - strategy="STRICT_SPREAD", - name=global_placement_group_name) - assert not same_name_pg.wait(10) - - # Remove a named placement group and make sure the second creation - # will successful. - ray.util.remove_placement_group(placement_group) - same_name_pg = ray.util.placement_group( - [{ - "CPU": 1 - } for _ in range(2)], - strategy="STRICT_SPREAD", - name=global_placement_group_name) - assert same_name_pg.wait(10) - - # Get a named placement group with a name that doesn't exist - # and make sure it will raise ValueError correctly. 
- error_count = 0 - try: - ray.util.get_placement_group("inexistent_pg") - except ValueError: - error_count = error_count + 1 - assert error_count == 1 - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/__init__.py b/python/ray/util/__init__.py index d20bac2a3ef4..b682f15dc878 100644 --- a/python/ray/util/__init__.py +++ b/python/ray/util/__init__.py @@ -4,8 +4,7 @@ from ray.util.debug import log_once, disable_log_once_globally, \ enable_periodic_logging from ray.util.placement_group import (placement_group, placement_group_table, - remove_placement_group, - get_placement_group) + remove_placement_group) from ray.util import rpdb as pdb from ray.util.serialization import register_serializer, deregister_serializer @@ -20,7 +19,6 @@ "pdb", "placement_group", "placement_group_table", - "get_placement_group", "remove_placement_group", "inspect_serializability", "collective", diff --git a/python/ray/util/placement_group.py b/python/ray/util/placement_group.py index c723f77d3ecc..6d15f607f22c 100644 --- a/python/ray/util/placement_group.py +++ b/python/ray/util/placement_group.py @@ -4,7 +4,6 @@ import ray from ray._raylet import PlacementGroupID, ObjectRef -from ray.utils import hex_to_binary bundle_reservation_check = None @@ -146,7 +145,7 @@ def _fill_bundle_cache_if_needed(self): def placement_group(bundles: List[Dict[str, float]], strategy: str = "PACK", - name: str = "", + name: str = "unnamed_group", lifetime=None) -> PlacementGroup: """Asynchronously creates a PlacementGroup. @@ -212,29 +211,6 @@ def remove_placement_group(placement_group: PlacementGroup): worker.core_worker.remove_placement_group(placement_group.id) -def get_placement_group(placement_group_name: str): - """Get a placement group object with a global name. - - Returns: - None if can't find a placement group with the given name. - The placement group object otherwise. 
- """ - if not placement_group_name: - raise ValueError( - "Please supply a non-empty value to get_placement_group") - worker = ray.worker.global_worker - worker.check_connected() - placement_group_info = ray.state.state.get_placement_group_by_name( - placement_group_name) - if placement_group_info is None: - raise ValueError( - f"Failed to look up actor with name: {placement_group_name}") - else: - return PlacementGroup( - PlacementGroupID( - hex_to_binary(placement_group_info["placement_group_id"]))) - - def placement_group_table(placement_group: PlacementGroup = None) -> list: """Get the state of the placement group from GCS. diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index 034e91082bc5..e7ddb765b9d3 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -727,7 +727,7 @@ class PlacementGroupInfoAccessor { virtual Status AsyncCreatePlacementGroup( const PlacementGroupSpecification &placement_group_spec) = 0; - /// Get a placement group data from GCS asynchronously by id. + /// Get a placement group data from GCS asynchronously. /// /// \param placement_group_id The id of a placement group to obtain from GCS. /// \return Status. @@ -735,14 +735,6 @@ class PlacementGroupInfoAccessor { const PlacementGroupID &placement_group_id, const OptionalItemCallback &callback) = 0; - /// Get a placement group data from GCS asynchronously by name. - /// - /// \param placement_group_name The name of a placement group to obtain from GCS. - /// \return Status. - virtual Status AsyncGetByName( - const std::string &placement_group_name, - const OptionalItemCallback &callback) = 0; - /// Get all placement group info from GCS asynchronously. /// /// \param callback Callback that will be called after lookup finished. 
diff --git a/src/ray/gcs/gcs_client/global_state_accessor.cc b/src/ray/gcs/gcs_client/global_state_accessor.cc index 669b16e2b4a6..4e9a6fa18cef 100644 --- a/src/ray/gcs/gcs_client/global_state_accessor.cc +++ b/src/ray/gcs/gcs_client/global_state_accessor.cc @@ -259,17 +259,5 @@ std::unique_ptr GlobalStateAccessor::GetPlacementGroupInfo( return placement_group_table_data; } -std::unique_ptr GlobalStateAccessor::GetPlacementGroupByName( - const std::string &placement_group_name) { - std::unique_ptr placement_group_table_data; - std::promise promise; - RAY_CHECK_OK(gcs_client_->PlacementGroups().AsyncGetByName( - placement_group_name, - TransformForOptionalItemCallback( - placement_group_table_data, promise))); - promise.get_future().get(); - return placement_group_table_data; -} - } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/gcs_client/global_state_accessor.h b/src/ray/gcs/gcs_client/global_state_accessor.h index c15963587d65..0c5695780c2a 100644 --- a/src/ray/gcs/gcs_client/global_state_accessor.h +++ b/src/ray/gcs/gcs_client/global_state_accessor.h @@ -151,24 +151,15 @@ class GlobalStateAccessor { /// deserialized with protobuf function. std::vector GetAllPlacementGroupInfo(); - /// Get information of a placement group from GCS Service by ID. + /// Get information of a placement group from GCS Service. /// - /// \param placement_group_id The ID of placement group to look up in the GCS Service. + /// \param placement_group The ID of placement group to look up in the GCS Service. /// \return Placement group info. To support multi-language, we serialize each /// PlacementGroupTableData and return the serialized string. Where used, it needs to be /// deserialized with protobuf function. std::unique_ptr GetPlacementGroupInfo( const PlacementGroupID &placement_group_id); - /// Get information of a placement group from GCS Service by name. - /// - /// \param placement_group_name The name of placement group to look up in the GCS - /// Service. 
\return Placement group info. To support multi-language, we serialize each - /// PlacementGroupTableData and return the serialized string. Where used, it needs to be - /// deserialized with protobuf function. - std::unique_ptr GetPlacementGroupByName( - const std::string &placement_group_name); - private: /// MultiItem transformation helper in template style. /// diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index 015da29f3e0f..c4f550e5075b 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -1466,26 +1466,6 @@ Status ServiceBasedPlacementGroupInfoAccessor::AsyncGet( return Status::OK(); } -Status ServiceBasedPlacementGroupInfoAccessor::AsyncGetByName( - const std::string &name, - const OptionalItemCallback &callback) { - RAY_LOG(DEBUG) << "Getting named placement group info, name = " << name; - rpc::GetNamedPlacementGroupRequest request; - request.set_name(name); - client_impl_->GetGcsRpcClient().GetNamedPlacementGroup( - request, [name, callback](const Status &status, - const rpc::GetNamedPlacementGroupReply &reply) { - if (reply.has_placement_group_table_data()) { - callback(status, reply.placement_group_table_data()); - } else { - callback(status, boost::none); - } - RAY_LOG(DEBUG) << "Finished getting named placement group info, status = " - << status << ", name = " << name; - }); - return Status::OK(); -} - Status ServiceBasedPlacementGroupInfoAccessor::AsyncGetAll( const MultiItemCallback &callback) { RAY_LOG(DEBUG) << "Getting all placement group info."; diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index c883e7b626a7..79deb2a6c3b2 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -453,10 +453,6 @@ class ServiceBasedPlacementGroupInfoAccessor : public PlacementGroupInfoAccessor const 
PlacementGroupID &placement_group_id, const OptionalItemCallback &callback) override; - Status AsyncGetByName( - const std::string &name, - const OptionalItemCallback &callback) override; - Status AsyncGetAll( const MultiItemCallback &callback) override; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc index 12260d867d37..a856002b6465 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc @@ -65,8 +65,7 @@ rpc::PlacementStrategy GcsPlacementGroup::GetStrategy() const { return placement_group_table_data_.strategy(); } -const rpc::PlacementGroupTableData &GcsPlacementGroup::GetPlacementGroupTableData() - const { +const rpc::PlacementGroupTableData &GcsPlacementGroup::GetPlacementGroupTableData() { return placement_group_table_data_; } @@ -148,21 +147,6 @@ void GcsPlacementGroupManager::RegisterPlacementGroup( } return; } - if (!placement_group->GetName().empty()) { - auto it = named_placement_groups_.find(placement_group->GetName()); - if (it == named_placement_groups_.end()) { - named_placement_groups_.emplace(placement_group->GetName(), - placement_group->GetPlacementGroupID()); - } else { - std::stringstream stream; - stream << "Failed to create placement group '" - << placement_group->GetPlacementGroupID() << "' because name '" - << placement_group->GetName() << "' already exists."; - RAY_LOG(WARNING) << stream.str(); - callback(Status::Invalid(stream.str())); - return; - } - } // Mark the callback as pending and invoke it after the placement_group has been // successfully created. 
@@ -194,9 +178,11 @@ void GcsPlacementGroupManager::RegisterPlacementGroup( PlacementGroupID GcsPlacementGroupManager::GetPlacementGroupIDByName( const std::string &name) { PlacementGroupID placement_group_id = PlacementGroupID::Nil(); - auto it = named_placement_groups_.find(name); - if (it != named_placement_groups_.end()) { - placement_group_id = it->second; + for (const auto &iter : registered_placement_groups_) { + if (iter.second->GetName() == name) { + placement_group_id = iter.first; + break; + } } return placement_group_id; } @@ -329,19 +315,10 @@ void GcsPlacementGroupManager::RemovePlacementGroup( on_placement_group_removed(Status::OK()); return; } - auto placement_group = std::move(placement_group_it->second); + auto placement_group = placement_group_it->second; registered_placement_groups_.erase(placement_group_it); placement_group_to_create_callbacks_.erase(placement_group_id); - // Remove placement group from `named_placement_groups_` if its name is not empty. - if (!placement_group->GetName().empty()) { - auto it = named_placement_groups_.find(placement_group->GetName()); - if (it != named_placement_groups_.end() && - it->second == placement_group->GetPlacementGroupID()) { - named_placement_groups_.erase(it); - } - } - // Destroy all bundles. gcs_placement_group_scheduler_->DestroyPlacementGroupBundleResourcesIfExists( placement_group_id); @@ -408,30 +385,6 @@ void GcsPlacementGroupManager::HandleGetPlacementGroup( ++counts_[CountType::GET_PLACEMENT_GROUP_REQUEST]; } -void GcsPlacementGroupManager::HandleGetNamedPlacementGroup( - const rpc::GetNamedPlacementGroupRequest &request, - rpc::GetNamedPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) { - const std::string &name = request.name(); - RAY_LOG(DEBUG) << "Getting named placement group info, name = " << name; - - // Try to look up the placement Group ID for the named placement group. 
- auto placement_group_id = GetPlacementGroupIDByName(name); - - if (placement_group_id.IsNil()) { - // The placement group was not found. - RAY_LOG(DEBUG) << "Placement Group with name '" << name << "' was not found"; - } else { - const auto &iter = registered_placement_groups_.find(placement_group_id); - RAY_CHECK(iter != registered_placement_groups_.end()); - reply->mutable_placement_group_table_data()->CopyFrom( - iter->second->GetPlacementGroupTableData()); - RAY_LOG(DEBUG) << "Finished get named placement group info, placement group id = " - << placement_group_id; - } - GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); - ++counts_[CountType::GET_NAMED_PLACEMENT_GROUP_REQUEST]; -} - void GcsPlacementGroupManager::HandleGetAllPlacementGroup( const rpc::GetAllPlacementGroupRequest &request, rpc::GetAllPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) { @@ -597,10 +550,6 @@ void GcsPlacementGroupManager::Initialize(const GcsInitData &gcs_init_data) { auto placement_group = std::make_shared(item.second); if (item.second.state() != rpc::PlacementGroupTableData::REMOVED) { registered_placement_groups_.emplace(item.first, placement_group); - if (!placement_group->GetName().empty()) { - named_placement_groups_.emplace(placement_group->GetName(), - placement_group->GetPlacementGroupID()); - } if (item.second.state() == rpc::PlacementGroupTableData::PENDING || item.second.state() == rpc::PlacementGroupTableData::RESCHEDULING) { @@ -638,7 +587,6 @@ std::string GcsPlacementGroupManager::DebugString() const { << ", WaitPlacementGroupUntilReady request count: " << counts_[CountType::WAIT_PLACEMENT_GROUP_UNTIL_READY_REQUEST] << ", Registered placement groups count: " << registered_placement_groups_.size() - << ", Named placement group count: " << named_placement_groups_.size() << ", Pending placement groups count: " << pending_placement_groups_.size() << "}"; return stream.str(); diff --git 
a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index 49a7634dfc0f..28ce82090077 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -65,7 +65,7 @@ class GcsPlacementGroup { } /// Get the immutable PlacementGroupTableData of this placement group. - const rpc::PlacementGroupTableData &GetPlacementGroupTableData() const; + const rpc::PlacementGroupTableData &GetPlacementGroupTableData(); /// Get the mutable bundle of this placement group. rpc::Bundle *GetMutableBundle(int bundle_index); @@ -155,13 +155,10 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { rpc::GetPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) override; - void HandleGetNamedPlacementGroup(const rpc::GetNamedPlacementGroupRequest &request, - rpc::GetNamedPlacementGroupReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - void HandleGetAllPlacementGroup(const rpc::GetAllPlacementGroupRequest &request, rpc::GetAllPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) override; + void HandleWaitPlacementGroupUntilReady( const rpc::WaitPlacementGroupUntilReadyRequest &request, rpc::WaitPlacementGroupUntilReadyReply *reply, @@ -318,9 +315,6 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { /// Reference of GcsResourceManager. GcsResourceManager &gcs_resource_manager_; - /// Maps placement group names to their placement group ID for lookups by name. - absl::flat_hash_map named_placement_groups_; - // Debug info. 
enum CountType { CREATE_PLACEMENT_GROUP_REQUEST = 0, @@ -328,8 +322,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { GET_PLACEMENT_GROUP_REQUEST = 2, GET_ALL_PLACEMENT_GROUP_REQUEST = 3, WAIT_PLACEMENT_GROUP_UNTIL_READY_REQUEST = 4, - GET_NAMED_PLACEMENT_GROUP_REQUEST = 5, - CountType_MAX = 6, + CountType_MAX = 5, }; uint64_t counts_[CountType::CountType_MAX] = {0}; }; diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc index 77784e44b9e4..fec3f2540401 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc @@ -174,31 +174,6 @@ TEST_F(GcsPlacementGroupManagerTest, TestGetPlacementGroupIDByName) { PlacementGroupID::FromBinary(request.placement_group_spec().placement_group_id())); } -TEST_F(GcsPlacementGroupManagerTest, TestRemoveNamedPlacementGroup) { - auto request = Mocker::GenCreatePlacementGroupRequest("test_name"); - std::atomic finished_placement_group_count(0); - gcs_placement_group_manager_->RegisterPlacementGroup( - std::make_shared(request), - [&finished_placement_group_count](const Status &status) { - ++finished_placement_group_count; - }); - - ASSERT_EQ(finished_placement_group_count, 0); - WaitForExpectedPgCount(1); - auto placement_group = mock_placement_group_scheduler_->placement_groups_.back(); - mock_placement_group_scheduler_->placement_groups_.pop_back(); - - gcs_placement_group_manager_->OnPlacementGroupCreationSuccess(placement_group); - WaitForExpectedCount(finished_placement_group_count, 1); - ASSERT_EQ(placement_group->GetState(), rpc::PlacementGroupTableData::CREATED); - // Remove the named placement group. 
- gcs_placement_group_manager_->RemovePlacementGroup( - placement_group->GetPlacementGroupID(), - [](const Status &status) { ASSERT_TRUE(status.ok()); }); - ASSERT_EQ(gcs_placement_group_manager_->GetPlacementGroupIDByName("test_name"), - PlacementGroupID::Nil()); -} - TEST_F(GcsPlacementGroupManagerTest, TestRescheduleWhenNodeAdd) { auto request = Mocker::GenCreatePlacementGroupRequest(); std::atomic finished_placement_group_count(0); diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index ed5ca92e2a42..8922ce6f466b 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -504,17 +504,6 @@ message WaitPlacementGroupUntilReadyReply { GcsStatus status = 1; } -message GetNamedPlacementGroupRequest { - // Name of the placement group. - string name = 1; -} - -message GetNamedPlacementGroupReply { - GcsStatus status = 1; - // Data of placement group. - PlacementGroupTableData placement_group_table_data = 2; -} - // Service for placement group info access. service PlacementGroupInfoGcsService { // Create placement group via gcs service. @@ -525,9 +514,6 @@ service PlacementGroupInfoGcsService { returns (RemovePlacementGroupReply); // Get placement group information via gcs service. rpc GetPlacementGroup(GetPlacementGroupRequest) returns (GetPlacementGroupReply); - // Get named placement group information via gcs service. - rpc GetNamedPlacementGroup(GetNamedPlacementGroupRequest) - returns (GetNamedPlacementGroupReply); // Get information of all placement group from GCS Service. 
rpc GetAllPlacementGroup(GetAllPlacementGroupRequest) returns (GetAllPlacementGroupReply); diff --git a/src/ray/rpc/gcs_server/gcs_rpc_client.h b/src/ray/rpc/gcs_server/gcs_rpc_client.h index bf9a72bed7db..fa77fddd2845 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_client.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_client.h @@ -254,10 +254,6 @@ class GcsRpcClient { VOID_GCS_RPC_CLIENT_METHOD(PlacementGroupInfoGcsService, GetPlacementGroup, placement_group_info_grpc_client_, ) - /// Get placement group data from GCS Service by name. - VOID_GCS_RPC_CLIENT_METHOD(PlacementGroupInfoGcsService, GetNamedPlacementGroup, - placement_group_info_grpc_client_, ) - /// Get information of all placement group from GCS Service. VOID_GCS_RPC_CLIENT_METHOD(PlacementGroupInfoGcsService, GetAllPlacementGroup, placement_group_info_grpc_client_, ) diff --git a/src/ray/rpc/gcs_server/gcs_rpc_server.h b/src/ray/rpc/gcs_server/gcs_rpc_server.h index 328aa5f7382d..0add85c0e04b 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_server.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_server.h @@ -522,10 +522,6 @@ class PlacementGroupInfoGcsServiceHandler { const WaitPlacementGroupUntilReadyRequest &request, WaitPlacementGroupUntilReadyReply *reply, SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetNamedPlacementGroup(const GetNamedPlacementGroupRequest &request, - GetNamedPlacementGroupReply *reply, - SendReplyCallback send_reply_callback) = 0; }; /// The `GrpcService` for `PlacementGroupInfoGcsService`. 
@@ -547,7 +543,6 @@ class PlacementGroupInfoGrpcService : public GrpcService { PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(CreatePlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(RemovePlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetPlacementGroup); - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetNamedPlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetAllPlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(WaitPlacementGroupUntilReady); } From b6fa5bae837e93379ae0a16709683b458678094f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 082/244] Revert "[hotfix][test][windows] Exclude k8s operator mock test from build. (#13924)" This reverts commit 81abe8b137d0c20af6e6c286ce130d7ffbff9ae5. --- ci/travis/ci.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 2d381ba24b15..ee339ead2779 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -170,7 +170,6 @@ test_python() { -python/ray/tests:test_stress_sharded # timeout -python/ray/tests:test_k8s_cluster_launcher -python/ray/tests:test_k8s_operator_examples - -python/ray/tests:test_k8s_operator_mock ) fi if [ 0 -lt "${#args[@]}" ]; then # Any targets to test? From b59164e184968cc294d6c93f4eef04f5c79a53a1 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 083/244] Revert "Add more user-friendly error message upon `async def` remote task (#13915)" This reverts commit 36d39851ec584732089c125d14c5662be8d521e5. 
--- doc/source/async_api.rst | 25 ------------------------- python/ray/_raylet.pyx | 6 ------ python/ray/tests/test_asyncio.py | 11 ----------- 3 files changed, 42 deletions(-) diff --git a/doc/source/async_api.rst b/doc/source/async_api.rst index 644699d8833b..a305c2dd1be3 100644 --- a/doc/source/async_api.rst +++ b/doc/source/async_api.rst @@ -162,28 +162,3 @@ Instead, you can use the ``max_concurrency`` Actor options without any async met Each invocation of the threaded actor will be running in a thread pool. The size of the threadpool is limited by the ``max_concurrency`` value. - -AsyncIO for Remote Tasks ------------------------- - -We don't support asyncio for remote tasks. The following snippet will fail: - -.. code-block:: python - - @ray.remote - async def f(): - pass - -Instead, you can wrap the ``async`` function with a wrapper to run the task synchronously: - -.. code-block:: python - - async def f(): - pass - - @ray.remote - def wrapper(): - import asyncio - asyncio.get_event_loop().run_until_complete(f()) - - \ No newline at end of file diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 47b6aa4f8358..3d2b9ea737c4 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -477,12 +477,6 @@ cdef execute_task( if debugger_breakpoint != b"": ray.util.pdb.set_trace( breakpoint_uuid=debugger_breakpoint) - if inspect.iscoroutinefunction(function_executor): - raise ValueError( - "'async def' should not be used for remote " - "tasks. You can wrap the async function with " - "`asyncio.get_event_loop.run_until(f())`. 
" - "See more at docs.ray.io/async_api.html") outputs = function_executor(*args, **kwargs) next_breakpoint = ( ray.worker.global_worker.debugger_breakpoint) diff --git a/python/ray/tests/test_asyncio.py b/python/ray/tests/test_asyncio.py index fd99343254d5..31f03aefa546 100644 --- a/python/ray/tests/test_asyncio.py +++ b/python/ray/tests/test_asyncio.py @@ -244,17 +244,6 @@ def wait(): wait_for_condition(lambda: "completed-2" in global_set) -def test_async_function_errored(ray_start_regular_shared): - @ray.remote - async def f(): - pass - - ref = f.remote() - - with pytest.raises(ValueError): - ray.get(ref) - - if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) From 31c8f3bc6f1ba99a49056ba1eff21c5789521600 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 084/244] Revert "[Serve] Revert "Revert "[Serve] Fix ServeHandle serialization"" and disable failing Windows test (#13771)" This reverts commit 14c5222b35cdf272abf0c39adc46639284e780a5. --- ci/travis/ci.sh | 1 - python/ray/serve/api.py | 7 ----- python/ray/serve/handle.py | 24 ++++----------- python/ray/serve/tests/test_handle.py | 44 +-------------------------- 4 files changed, 7 insertions(+), 69 deletions(-) diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index ee339ead2779..6267a232125a 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -140,7 +140,6 @@ test_python() { python/ray/serve/... python/ray/tests/... -python/ray/serve:test_api # segfault on windows? https://github.com/ray-project/ray/issues/12541 - -python/ray/serve:test_handle # "fatal error" (?) 
https://github.com/ray-project/ray/pull/13695 -python/ray/tests:test_actor_advanced # timeout -python/ray/tests:test_advanced_2 -python/ray/tests:test_advanced_3 # test_invalid_unicode_in_worker_log() fails on Windows diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 4c0a0a91ff7b..2e0490631d59 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -66,8 +66,6 @@ def check(self, *args, **kwargs): class ThreadProxiedRouter: def __init__(self, controller_handle, sync: bool): - self.controller_handle = controller_handle - self.sync = sync self.router = Router(controller_handle) if sync: @@ -94,11 +92,6 @@ def _remote(self, endpoint_name, handle_options, request_data, **kwargs) return coro - def __reduce__(self): - deserializer = ThreadProxiedRouter - serialized_data = (self.controller_handle, self.sync) - return deserializer, serialized_data - class Client: def __init__(self, diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 3659e5978bf2..475f64556cb5 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -6,6 +6,7 @@ from ray.serve.utils import get_random_letters from ray.util import metrics +from ray.serve.router import Router @dataclass(frozen=True) @@ -41,11 +42,10 @@ class RayServeHandle: # raises RayTaskError Exception """ - def __init__( - self, - router, # ThreadProxiedRouter - endpoint_name, - handle_options: Optional[HandleOptions] = None): + def __init__(self, + router: Router, + endpoint_name, + handle_options: Optional[HandleOptions] = None): self.router = router self.endpoint_name = endpoint_name self.handle_options = handle_options or HandleOptions() @@ -91,7 +91,7 @@ def options(self, async def remote(self, request_data: Optional[Union[Dict, Any]] = None, **kwargs): - """Issue an asynchronous request to the endpoint. + """Issue an asynchrounous request to the endpoint. 
Returns a Ray ObjectRef whose results can be waited for or retrieved using ray.wait or ray.get (or ``await object_ref``), respectively. @@ -112,12 +112,6 @@ async def remote(self, def __repr__(self): return f"{self.__class__.__name__}(endpoint='{self.endpoint_name}')" - def __reduce__(self): - deserializer = RayServeHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data - class RayServeSyncHandle(RayServeHandle): def remote(self, request_data: Optional[Union[Dict, Any]] = None, @@ -144,9 +138,3 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( coro, self.router.async_loop) return future.result() - - def __reduce__(self): - deserializer = RayServeSyncHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index 88ab9d2c2b7a..c17db7686aad 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -1,51 +1,9 @@ import requests -import pytest + import ray from ray import serve -@pytest.mark.asyncio -async def test_async_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - class TaskActor: - async def task(self, handle): - ref = await handle.remote() - output = await ref - return output - - handle = client.get_handle("f", sync=False) - - task_actor = TaskActor.remote() - result = await task_actor.task.remote(handle) - assert result == "hello" - - -def test_sync_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - def task(handle): - return 
ray.get(handle.remote()) - - handle = client.get_handle("f", sync=True) - result_ref = task.remote(handle) - assert ray.get(result_ref) == "hello" - - def test_handle_in_endpoint(serve_instance): client = serve_instance From 6c0f487f431d58645edc90e4f6f0d8c88088bcf5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 085/244] Revert "[serve] Built-in support for imported backends (#13867)" This reverts commit 68b6101a3cfa09a30cb0ccc9c64750c2a1209e2a. --- doc/source/serve/advanced.rst | 9 +-- doc/source/serve/package-ref.rst | 4 ++ python/ray/serve/api.py | 23 ++++--- python/ray/serve/backend_state.py | 4 +- python/ray/serve/backend_worker.py | 40 ++++------- python/ray/serve/config.py | 68 ++++++++----------- .../serve/examples/doc/imported_backend.py | 5 +- python/ray/serve/tests/test_backend_worker.py | 4 +- python/ray/serve/tests/test_config.py | 3 +- .../ray/serve/tests/test_imported_backend.py | 16 +---- python/ray/serve/tests/test_util.py | 14 ++-- python/ray/serve/utils.py | 16 ++--- 12 files changed, 88 insertions(+), 118 deletions(-) diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst index 7a6027ad54c3..542a3ce188ec 100644 --- a/doc/source/serve/advanced.rst +++ b/doc/source/serve/advanced.rst @@ -398,9 +398,10 @@ as shown below. The dependencies required in the backend may be different than the dependencies installed in the driver program (the one running Serve API -calls). In this case, you can pass the backend in as an import path that will -be imported in the Python environment in the workers, but not the driver. -Example: +calls). In this case, you can use an +:mod:`ImportedBackend ` to specify a +backend based on a class that is installed in the Python environment that +the workers will run in. Example: .. literalinclude:: ../../../python/ray/serve/examples/doc/imported_backend.py @@ -420,4 +421,4 @@ in :mod:`serve.start `: .. 
note:: Using the "EveryNode" option, you can point a cloud load balancer to the instance group of Ray cluster to achieve high availability of Serve's HTTP - proxies. + proxies. \ No newline at end of file diff --git a/doc/source/serve/package-ref.rst b/doc/source/serve/package-ref.rst index 20ed340be1fb..3df9c291557f 100644 --- a/doc/source/serve/package-ref.rst +++ b/doc/source/serve/package-ref.rst @@ -37,3 +37,7 @@ objects instead of Starlette requests. Batching Requests ----------------- .. autofunction:: ray.serve.accept_batch + +Built-in Backends +----------------- +.. autoclass:: ray.serve.backends.ImportedBackend diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 2e0490631d59..b42cd78464a7 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -323,23 +323,22 @@ def get_backend_config(self, backend_tag: str) -> BackendConfig: def create_backend( self, backend_tag: str, - backend_def: Union[Callable, Type[Callable], str], - *init_args: Any, + func_or_class: Union[Callable, Type[Callable]], + *actor_init_args: Any, ray_actor_options: Optional[Dict] = None, config: Optional[Union[BackendConfig, Dict[str, Any]]] = None, env: Optional[CondaEnv] = None) -> None: """Create a backend with the provided tag. + The backend will serve requests with func_or_class. + Args: backend_tag (str): a unique tag assign to identify this backend. - backend_def (callable, class, str): a function or class - implementing __call__ and returning a JSON-serializable object - or a Starlette Response object. A string import path can also - be provided (e.g., "my_module.MyClass"), in which case the - underlying function or class will be imported dynamically in - the worker replicas. - *init_args (optional): the arguments to pass to the class - initialization method. Not valid if backend_def is a function. + func_or_class (callable, class): a function or a class implementing + __call__, returning a JSON-serializable object or a + Starlette Response object. 
+ *actor_init_args (optional): the arguments to pass to the class + initialization method. ray_actor_options (optional): options to be passed into the @ray.remote decorator for the backend actor. config (dict, serve.BackendConfig, optional): configuration options @@ -387,7 +386,9 @@ def create_backend( ray_actor_options.update( override_environment_variables={"PYTHONHOME": conda_env_dir}) replica_config = ReplicaConfig( - backend_def, *init_args, ray_actor_options=ray_actor_options) + func_or_class, + *actor_init_args, + ray_actor_options=ray_actor_options) metadata = BackendMetadata( accepts_batches=replica_config.accepts_batches, is_blocking=replica_config.is_blocking) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index ba6e2260f2f8..418ab3b2ad12 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -97,7 +97,7 @@ def start(self, backend_info: Optional[BackendInfo]): max_task_retries=-1, **backend_info.replica_config.ray_actor_options).remote( self._backend_tag, self._replica_tag, - backend_info.replica_config.init_args, + backend_info.replica_config.actor_init_args, backend_info.backend_config, self._controller_name) self._startup_obj_ref = self._actor_handle.ready.remote() self._state = ReplicaState.STARTING @@ -277,7 +277,7 @@ def create_backend(self, backend_tag: BackendTag, return None backend_replica_class = create_backend_replica( - replica_config.backend_def) + replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. 
diff --git a/python/ray/serve/backend_worker.py b/python/ray/serve/backend_worker.py index 5740cf4f5a6d..da087efa5434 100644 --- a/python/ray/serve/backend_worker.py +++ b/python/ray/serve/backend_worker.py @@ -13,7 +13,7 @@ from ray.async_compat import sync_to_async from ray.serve.utils import (parse_request_item, _get_logger, chain_future, - unpack_future, import_attr) + unpack_future) from ray.serve.exceptions import RayServeException from ray.util import metrics from ray.serve.config import BackendConfig @@ -94,40 +94,33 @@ async def wait_for_batch(self) -> List[Query]: return batch -def create_backend_replica(backend_def: Union[Callable, Type[Callable], str]): +def create_backend_replica(func_or_class: Union[Callable, Type[Callable]]): """Creates a replica class wrapping the provided function or class. This approach is picked over inheritance to avoid conflict between user provided class and the RayServeReplica class. """ - backend_def = backend_def + + if inspect.isfunction(func_or_class): + is_function = True + elif inspect.isclass(func_or_class): + is_function = False + else: + assert False, "func_or_class must be function or class." # TODO(architkulkarni): Add type hints after upgrading cloudpickle class RayServeWrappedReplica(object): def __init__(self, backend_tag, replica_tag, init_args, backend_config: BackendConfig, controller_name: str): - if isinstance(backend_def, str): - backend = import_attr(backend_def) - else: - backend = backend_def - - if inspect.isfunction(backend): - is_function = True - elif inspect.isclass(backend): - is_function = False - else: - assert False, ("backend_def must be function, class, or " - "corresponding import path.") - # Set the controller name so that serve.connect() in the user's # backend code will connect to the instance that this backend is # running in. 
ray.serve.api._set_internal_replica_context( backend_tag, replica_tag, controller_name) if is_function: - _callable = backend + _callable = func_or_class else: - _callable = backend(*init_args) + _callable = func_or_class(*init_args) assert controller_name, "Must provide a valid controller_name" controller_handle = ray.get_actor(controller_name) @@ -151,12 +144,8 @@ def ready(self): async def drain_pending_queries(self): return await self.backend.drain_pending_queries() - if isinstance(backend_def, str): - RayServeWrappedReplica.__name__ = "RayServeReplica_{}".format( - backend_def) - else: - RayServeWrappedReplica.__name__ = "RayServeReplica_{}".format( - backend_def.__name__) + RayServeWrappedReplica.__name__ = "RayServeReplica_{}".format( + func_or_class.__name__) return RayServeWrappedReplica @@ -426,7 +415,8 @@ def reconfigure(self, user_config) -> None: if user_config: if self.is_function: raise ValueError( - "backend_def must be a class to use user_config") + "argument func_or_class must be a class to use user_config" + ) elif not hasattr(self.callable, BACKEND_RECONFIGURE_METHOD): raise RayServeException("user_config specified but backend " + self.backend_tag + " missing " + diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 8060b406f0de..41a1eca08ae8 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -5,29 +5,22 @@ import pydantic from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator -from ray.serve.constants import DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT +from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, + DEFAULT_HTTP_PORT) -def _callable_accepts_batch(backend_def): - if inspect.isfunction(backend_def): - return hasattr(backend_def, "_serve_accept_batch") - elif inspect.isclass(backend_def): - return hasattr(backend_def.__call__, "_serve_accept_batch") - elif isinstance(backend_def, str): - return True - else: - raise TypeError("backend_def must be function, 
class, or str.") +def _callable_accepts_batch(func_or_class): + if inspect.isfunction(func_or_class): + return hasattr(func_or_class, "_serve_accept_batch") + elif inspect.isclass(func_or_class): + return hasattr(func_or_class.__call__, "_serve_accept_batch") -def _callable_is_blocking(backend_def): - if inspect.isfunction(backend_def): - return not inspect.iscoroutinefunction(backend_def) - elif inspect.isclass(backend_def): - return not inspect.iscoroutinefunction(backend_def.__call__) - elif isinstance(backend_def, str): - return False - else: - raise TypeError("backend_def must be function, class, or str.") +def _callable_is_blocking(func_or_class): + if inspect.isfunction(func_or_class): + return not inspect.iscoroutinefunction(func_or_class) + elif inspect.isclass(func_or_class): + return not inspect.iscoroutinefunction(func_or_class.__call__) @dataclass @@ -112,11 +105,8 @@ def set_max_queries_by_mode(cls, v, values): # noqa 805 # Pipeline/async mode: if the servable is not blocking, # router should just keep pushing queries to the replicas # until a high limit. - # TODO(edoakes): setting this to a relatively low constant because - # we can't determine if imported backends are sync or async, but we - # may consider tweaking it in the future. if not values["internal_metadata"].is_blocking: - v = 100 + v = ASYNC_CONCURRENCY # Batch inference mode: user specifies non zero timeout to wait for # full batch. 
We will use 2*max_batch_size to perform double @@ -129,11 +119,12 @@ def set_max_queries_by_mode(cls, v, values): # noqa 805 class ReplicaConfig: - def __init__(self, backend_def, *init_args, ray_actor_options=None): - self.backend_def = backend_def - self.accepts_batches = _callable_accepts_batch(backend_def) - self.is_blocking = _callable_is_blocking(backend_def) - self.init_args = list(init_args) + def __init__(self, func_or_class, *actor_init_args, + ray_actor_options=None): + self.func_or_class = func_or_class + self.accepts_batches = _callable_accepts_batch(func_or_class) + self.is_blocking = _callable_is_blocking(func_or_class) + self.actor_init_args = list(actor_init_args) if ray_actor_options is None: self.ray_actor_options = {} else: @@ -143,28 +134,27 @@ def __init__(self, backend_def, *init_args, ray_actor_options=None): self._validate() def _validate(self): - # Validate that backend_def is an import path, function, or class. - if isinstance(self.backend_def, str): - pass - elif inspect.isfunction(self.backend_def): - if len(self.init_args) != 0: + # Validate that func_or_class is a function or class. 
+ if inspect.isfunction(self.func_or_class): + if len(self.actor_init_args) != 0: raise ValueError( - "init_args not supported for function backend.") - elif not inspect.isclass(self.backend_def): + "actor_init_args not supported for function backend.") + elif not inspect.isclass(self.func_or_class): raise TypeError( "Backend must be a function or class, it is {}.".format( - type(self.backend_def))) + type(self.func_or_class))) if not isinstance(self.ray_actor_options, dict): raise TypeError("ray_actor_options must be a dictionary.") elif "lifetime" in self.ray_actor_options: raise ValueError( - "Specifying lifetime in init_args is not allowed.") + "Specifying lifetime in actor_init_args is not allowed.") elif "name" in self.ray_actor_options: - raise ValueError("Specifying name in init_args is not allowed.") + raise ValueError( + "Specifying name in actor_init_args is not allowed.") elif "max_restarts" in self.ray_actor_options: raise ValueError("Specifying max_restarts in " - "init_args is not allowed.") + "actor_init_args is not allowed.") else: # Ray defaults to zero CPUs for placement, we default to one here. if "num_cpus" not in self.ray_actor_options: diff --git a/python/ray/serve/examples/doc/imported_backend.py b/python/ray/serve/examples/doc/imported_backend.py index 596604aaa4d9..d80d73b4a72c 100644 --- a/python/ray/serve/examples/doc/imported_backend.py +++ b/python/ray/serve/examples/doc/imported_backend.py @@ -1,12 +1,13 @@ import requests from ray import serve +from ray.serve.backends import ImportedBackend client = serve.start() # Include your class as input to the ImportedBackend constructor. 
-import_path = "ray.serve.utils.MockImportedBackend" -client.create_backend("imported", import_path, "input_arg") +backend_class = ImportedBackend("ray.serve.utils.MockImportedBackend") +client.create_backend("imported", backend_class, "input_arg") client.create_endpoint("imported", backend="imported", route="/imported") print(requests.get("http://127.0.0.1:8000/imported").text) diff --git a/python/ray/serve/tests/test_backend_worker.py b/python/ray/serve/tests/test_backend_worker.py index 11c22e02e976..74c5418df253 100644 --- a/python/ray/serve/tests/test_backend_worker.py +++ b/python/ray/serve/tests/test_backend_worker.py @@ -16,7 +16,7 @@ def setup_worker(name, - backend_def, + func_or_class, init_args=None, backend_config=BackendConfig(), controller_name=""): @@ -26,7 +26,7 @@ def setup_worker(name, @ray.remote class WorkerActor: def __init__(self): - self.worker = create_backend_replica(backend_def)( + self.worker = create_backend_replica(func_or_class)( name, name + ":tag", init_args, backend_config, controller_name) diff --git a/python/ray/serve/tests/test_config.py b/python/ray/serve/tests/test_config.py index 5227b3ff5c53..40942ad767eb 100644 --- a/python/ray/serve/tests/test_config.py +++ b/python/ray/serve/tests/test_config.py @@ -3,6 +3,7 @@ from ray import serve from ray.serve.config import (BackendConfig, DeploymentMode, HTTPOptions, ReplicaConfig, BackendMetadata) +from ray.serve.constants import ASYNC_CONCURRENCY from pydantic import ValidationError @@ -41,7 +42,7 @@ def test_backend_config_validation(): assert BackendConfig( max_batch_size=10, internal_metadata=BackendMetadata( - is_blocking=False)).max_concurrent_queries == 100 + is_blocking=False)).max_concurrent_queries == ASYNC_CONCURRENCY assert BackendConfig( max_batch_size=7, batch_wait_timeout=1.0).max_concurrent_queries == 14 diff --git a/python/ray/serve/tests/test_imported_backend.py b/python/ray/serve/tests/test_imported_backend.py index 4b13980725ac..99f08a04ba07 100644 --- 
a/python/ray/serve/tests/test_imported_backend.py +++ b/python/ray/serve/tests/test_imported_backend.py @@ -1,16 +1,15 @@ import ray +from ray.serve.backends import ImportedBackend from ray.serve.config import BackendConfig def test_imported_backend(serve_instance): client = serve_instance + backend_class = ImportedBackend("ray.serve.utils.MockImportedBackend") config = BackendConfig(user_config="config", max_batch_size=2) client.create_backend( - "imported", - "ray.serve.utils.MockImportedBackend", - "input_arg", - config=config) + "imported", backend_class, "input_arg", config=config) client.create_endpoint("imported", backend="imported") # Basic sanity check. @@ -28,12 +27,3 @@ def test_imported_backend(serve_instance): # Check that other call methods work. handle = handle.options(method_name="other_method") assert ray.get(handle.remote("hello")) == "hello" - - # Check that functions work as well. - client.create_backend( - "imported_func", - "ray.serve.utils.mock_imported_function", - config=BackendConfig(max_batch_size=2)) - client.create_endpoint("imported_func", backend="imported_func") - handle = client.get_handle("imported_func") - assert ray.get(handle.remote("hello")) == "hello" diff --git a/python/ray/serve/tests/test_util.py b/python/ray/serve/tests/test_util.py index 95f526c31288..9893bc4cee3e 100644 --- a/python/ray/serve/tests/test_util.py +++ b/python/ray/serve/tests/test_util.py @@ -9,7 +9,7 @@ import ray from ray.serve.utils import (ServeEncoder, chain_future, unpack_future, try_schedule_resources_on_nodes, - get_conda_env_dir, import_attr) + get_conda_env_dir, import_class) def test_bytes_encoder(): @@ -126,11 +126,11 @@ def test_get_conda_env_dir(tmp_path): os.environ["CONDA_PREFIX"] = "" -def test_import_attr(): - assert import_attr("ray.serve.Client") == ray.serve.api.Client - assert import_attr("ray.serve.api.Client") == ray.serve.api.Client +def test_import_class(): + assert import_class("ray.serve.Client") == ray.serve.api.Client + assert 
import_class("ray.serve.api.Client") == ray.serve.api.Client - policy_cls = import_attr("ray.serve.controller.TrafficPolicy") + policy_cls = import_class("ray.serve.controller.TrafficPolicy") assert policy_cls == ray.serve.controller.TrafficPolicy policy = policy_cls({"endpoint1": 0.5, "endpoint2": 0.5}) @@ -140,10 +140,6 @@ def test_import_attr(): print(repr(policy)) - # Very meta... - import_attr_2 = import_attr("ray.serve.utils.import_attr") - assert import_attr_2 == import_attr - if __name__ == "__main__": import sys diff --git a/python/ray/serve/utils.py b/python/ray/serve/utils.py index 1d19593e63b1..10753fcb5a2c 100644 --- a/python/ray/serve/utils.py +++ b/python/ray/serve/utils.py @@ -359,26 +359,22 @@ def get_node_id_for_actor(actor_handle): return ray.actors()[actor_handle._actor_id.hex()]["Address"]["NodeID"] -def import_attr(full_path: str): - """Given a full import path to a module attr, return the imported attr. +def import_class(full_path: str): + """Given a full import path to a class name, return the imported class. 
For example, the following are equivalent: - MyClass = import_attr("module.submodule.MyClass") + MyClass = import_class("module.submodule.MyClass") from module.submodule import MyClass Returns: - Imported attr + Imported class """ last_period_idx = full_path.rfind(".") - attr_name = full_path[last_period_idx + 1:] + class_name = full_path[last_period_idx + 1:] module_name = full_path[:last_period_idx] module = importlib.import_module(module_name) - return getattr(module, attr_name) - - -async def mock_imported_function(batch): - return [await request.body() for request in batch] + return getattr(module, class_name) class MockImportedBackend: From d37e8e4b529ab1c3e7c37d6aa505b37af2eadce7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 086/244] Revert "[autoscaler][kubernetes] Add ability to not copy cluster config to head node when calling `create_or_update_head_node`. (#13720)" This reverts commit 729883ae3ed86cf1b8e8b4c21a60a529a55e1f16. 
--- python/ray/autoscaler/_private/commands.py | 129 ++++++-------- .../operator_configs/example_cluster.yaml | 2 +- .../operator_configs/example_cluster2.yaml | 2 +- python/ray/ray_operator/operator.py | 3 +- python/ray/tests/BUILD | 1 - python/ray/tests/test_k8s_operator_mock.py | 162 ------------------ 6 files changed, 53 insertions(+), 246 deletions(-) delete mode 100644 python/ray/tests/test_k8s_operator_mock.py diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index 84d3b15694ad..df0a104493eb 100644 --- a/python/ray/autoscaler/_private/commands.py +++ b/python/ray/autoscaler/_private/commands.py @@ -34,7 +34,7 @@ from ray.autoscaler._private.cli_logger import cli_logger, cf from ray.autoscaler._private.updater import NodeUpdaterThread from ray.autoscaler._private.command_runner import set_using_login_shells, \ - set_rsync_silent + set_rsync_silent from ray.autoscaler._private.event_system import (CreateClusterEvent, global_event_system) from ray.autoscaler._private.log_timer import LogTimer @@ -137,22 +137,17 @@ def request_resources(num_cpus: Optional[int] = None, overwrite=True) -def create_or_update_cluster( - config_file: str, - override_min_workers: Optional[int], - override_max_workers: Optional[int], - no_restart: bool, - restart_only: bool, - yes: bool, - override_cluster_name: Optional[str] = None, - no_config_cache: bool = False, - redirect_command_output: Optional[bool] = False, - use_login_shells: bool = True, - no_monitor_on_head: bool = False) -> Dict[str, Any]: +def create_or_update_cluster(config_file: str, + override_min_workers: Optional[int], + override_max_workers: Optional[int], + no_restart: bool, + restart_only: bool, + yes: bool, + override_cluster_name: Optional[str] = None, + no_config_cache: bool = False, + redirect_command_output: Optional[bool] = False, + use_login_shells: bool = True) -> Dict[str, Any]: """Create or updates an autoscaling Ray cluster from a config json.""" - # 
no_monitor_on_head is an internal flag used by the Ray K8s operator. - # If True, prevents autoscaling config sync to the Ray head during cluster - # creation. See https://github.com/ray-project/ray/pull/13720. set_using_login_shells(use_login_shells) if not use_login_shells: cmd_output_util.set_allow_interactive(False) @@ -230,7 +225,7 @@ def handle_cli_override(key, override): try_logging_config(config) get_or_create_head_node(config, config_file, no_restart, restart_only, yes, - override_cluster_name, no_monitor_on_head) + override_cluster_name) return config @@ -490,17 +485,13 @@ def monitor_cluster(cluster_config_file: str, num_lines: int, port_forward=None) -def warn_about_bad_start_command(start_commands: List[str], - no_monitor_on_head: bool = False) -> None: +def warn_about_bad_start_command(start_commands: List[str]) -> None: ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands)) if len(ray_start_cmd) == 0: cli_logger.warning( "Ray runtime will not be started because `{}` is not in `{}`.", cf.bold("ray start"), cf.bold("head_start_ray_commands")) - - autoscaling_config_in_ray_start_cmd = any( - "autoscaling-config" in x for x in ray_start_cmd) - if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head): + if not any("autoscaling-config" in x for x in ray_start_cmd): cli_logger.warning( "The head node will not launch any workers because " "`{}` does not have `{}` set.\n" @@ -516,7 +507,6 @@ def get_or_create_head_node(config: Dict[str, Any], restart_only: bool, yes: bool, override_cluster_name: Optional[str], - no_monitor_on_head: bool = False, _provider: Optional[NodeProvider] = None, _runner: ModuleType = subprocess) -> None: """Create the cluster head node, which in turn creates the workers.""" @@ -639,11 +629,41 @@ def get_or_create_head_node(config: Dict[str, Any], (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf( config["file_mounts"], None, config) - if not no_monitor_on_head: - # Return remote_config_file to 
avoid prematurely closing it. - config, remote_config_file = _set_up_config_for_head_node( - config, provider, no_restart) - cli_logger.print("Prepared bootstrap config") + # Rewrite the auth config so that the head + # node can update the workers + remote_config = copy.deepcopy(config) + + # drop proxy options if they exist, otherwise + # head node won't be able to connect to workers + remote_config["auth"].pop("ssh_proxy_command", None) + + if "ssh_private_key" in config["auth"]: + remote_key_path = "~/ray_bootstrap_key.pem" + remote_config["auth"]["ssh_private_key"] = remote_key_path + + # Adjust for new file locations + new_mounts = {} + for remote_path in config["file_mounts"]: + new_mounts[remote_path] = remote_path + remote_config["file_mounts"] = new_mounts + remote_config["no_restart"] = no_restart + + remote_config = provider.prepare_for_head_node(remote_config) + + # Now inject the rewritten config and SSH key into the head node + remote_config_file = tempfile.NamedTemporaryFile( + "w", prefix="ray-bootstrap-") + remote_config_file.write(json.dumps(remote_config)) + remote_config_file.flush() + config["file_mounts"].update({ + "~/ray_bootstrap_config.yaml": remote_config_file.name + }) + + if "ssh_private_key" in config["auth"]: + config["file_mounts"].update({ + remote_key_path: config["auth"]["ssh_private_key"], + }) + cli_logger.print("Prepared bootstrap config") if restart_only: setup_commands = [] @@ -656,8 +676,7 @@ def get_or_create_head_node(config: Dict[str, Any], ray_start_commands = config["head_start_ray_commands"] if not no_restart: - warn_about_bad_start_command(ray_start_commands, - no_monitor_on_head) + warn_about_bad_start_command(ray_start_commands) updater = NodeUpdaterThread( node_id=head_node, @@ -718,54 +737,6 @@ def get_or_create_head_node(config: Dict[str, Any], cli_logger.print(" {}", remote_shell_str.strip()) -def _set_up_config_for_head_node(config: Dict[str, Any], - provider: NodeProvider, - no_restart: bool) ->\ - 
Tuple[Dict[str, Any], Any]: - """Prepares autoscaling config and, if needed, ssh key, to be mounted onto - the Ray head node for use by the autoscaler. - - Returns the modified config and the temporary config file that will be - mounted onto the head node. - """ - # Rewrite the auth config so that the head - # node can update the workers - remote_config = copy.deepcopy(config) - - # drop proxy options if they exist, otherwise - # head node won't be able to connect to workers - remote_config["auth"].pop("ssh_proxy_command", None) - - if "ssh_private_key" in config["auth"]: - remote_key_path = "~/ray_bootstrap_key.pem" - remote_config["auth"]["ssh_private_key"] = remote_key_path - - # Adjust for new file locations - new_mounts = {} - for remote_path in config["file_mounts"]: - new_mounts[remote_path] = remote_path - remote_config["file_mounts"] = new_mounts - remote_config["no_restart"] = no_restart - - remote_config = provider.prepare_for_head_node(remote_config) - - # Now inject the rewritten config and SSH key into the head node - remote_config_file = tempfile.NamedTemporaryFile( - "w", prefix="ray-bootstrap-") - remote_config_file.write(json.dumps(remote_config)) - remote_config_file.flush() - config["file_mounts"].update({ - "~/ray_bootstrap_config.yaml": remote_config_file.name - }) - - if "ssh_private_key" in config["auth"]: - config["file_mounts"].update({ - remote_key_path: config["auth"]["ssh_private_key"], - }) - - return config, remote_config_file - - def attach_cluster(config_file: str, start: bool, use_screen: bool, diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml index 2735c72eb948..8d2aa4561936 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml @@ -119,7 +119,7 @@ spec: # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port 
forward. headStartRayCommands: - ray stop - - ulimit -n 65536; ray start --head --no-monitor --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 # Commands to start Ray on worker nodes. You don't need to change this. workerStartRayCommands: - ray stop diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml index 7341e16fa914..0c6eb604e1eb 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml @@ -119,7 +119,7 @@ spec: # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward. headStartRayCommands: - ray stop - - ulimit -n 65536; ray start --head --no-monitor --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 # Commands to start Ray on worker nodes. You don't need to change this. 
workerStartRayCommands: - ray stop diff --git a/python/ray/ray_operator/operator.py b/python/ray/ray_operator/operator.py index bfbde80553ce..e39f4cfef322 100644 --- a/python/ray/ray_operator/operator.py +++ b/python/ray/ray_operator/operator.py @@ -62,8 +62,7 @@ def start_head(self) -> None: no_restart=False, restart_only=False, yes=True, - no_config_cache=True, - no_monitor_on_head=True) + no_config_cache=True) self.write_config() def start_monitor(self) -> None: diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 4ef81d504f63..806f04fe56df 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -92,7 +92,6 @@ py_test_module_list( "test_dask_scheduler.py", "test_debug_tools.py", "test_job.py", - "test_k8s_operator_mock.py", "test_memstat.py", "test_metrics_agent.py", "test_microbenchmarks.py", diff --git a/python/ray/tests/test_k8s_operator_mock.py b/python/ray/tests/test_k8s_operator_mock.py deleted file mode 100644 index a3bbf5766922..000000000000 --- a/python/ray/tests/test_k8s_operator_mock.py +++ /dev/null @@ -1,162 +0,0 @@ -import os -import unittest -from unittest.mock import patch - -import pytest -import tempfile -import yaml - -from ray.autoscaler.tags import TAG_RAY_NODE_KIND, NODE_KIND_HEAD -from ray.autoscaler.node_provider import NodeProvider -from ray.ray_operator.operator import RayCluster -from ray.ray_operator.operator_utils import cr_to_config -from ray.autoscaler._private.kubernetes.node_provider import\ - KubernetesNodeProvider -from ray.autoscaler._private.updater import NodeUpdaterThread -""" -Tests that, when the K8s operator launches a cluster, no files are mounted onto -the head node. -The main idea is to mock the NodeUpdaterThread to report if it received any -file mounts. 
-""" - -# NodeUpdaterThread mock methods -START = "start" -JOIN = "join" - - -def mock_start(self): - # Detects any file mounts passed in NodeUpdaterThread.__init__() - if self.file_mounts: - raise ValueError("File mounts in operator's code path.") - - -def mock_join(self): - # Fake success - self.exitcode = 0 - return - - -# RayCluster mock methods -SETUP_LOGGING = "setup_logging" -WRITE_CONFIG = "write_config" - - -def mock_setup_logging(self): - return - - -def mock_write_config(self): - # Use a named temporary file instead of a real one. - self.config_file = tempfile.NamedTemporaryFile("w") - self.config_path = self.config_file.name - yaml.dump(self.config, self.config_file) - self.config_file.flush() - - -# KubernetesNodeProvider mock methods -INIT = "__init__" -NON_TERMINATED_NODES = "non_terminated_nodes" -CREATE_NODE = "create_node" -BOOTSTRAP_CONFIG = "bootstrap_config" - -HEAD_NODE_TAGS = {TAG_RAY_NODE_KIND: NODE_KIND_HEAD} - - -def mock_init(self, provider_config, cluster_name): - # Adds an attribute to detect if the provider has created the head. - NodeProvider.__init__(self, provider_config, cluster_name) - self.cluster_name = cluster_name - self.namespace = provider_config["namespace"] - - self._head_created = False - - -def mock_non_terminated_nodes(self, node_tags): - # First time this is called, it returns an empty list. - # Second time, returns a mock head node id. - if HEAD_NODE_TAGS.items() <= node_tags.items() and self._head_created: - # Second call. - return ["HEAD"] - elif node_tags == HEAD_NODE_TAGS: - # First call. - return [] - else: - # Should not go here. - raise ValueError("Test passed invalid parameters.") - - -def mock_create_node(self, node_config, tags, count): - # Called during head node creation. Marks that a head node has been - # created. 
- if HEAD_NODE_TAGS.items() <= tags.items() and count == 1: - self._head_created = True - else: - raise ValueError(f"Test passed invalid parameter {tags} {count}.") - - -def mock_bootstrap_config(cluster_config): - # KubernetesNodeProvider.bootstrap_config has no side effects - # on cluster_config -- the method just creates K8s API objects. - # Thus it makes sense to dummy out the K8s API calls and return - # the config. - return cluster_config - - -def custom_resources(): - # K8s custom resources used in test. - here = os.path.realpath(__file__) - ray_python_root = os.path.dirname(os.path.dirname(here)) - relative_path = "autoscaler/kubernetes/operator_configs" - abs_path = os.path.join(ray_python_root, relative_path) - cluster1, cluster2 = "example_cluster.yaml", "example_cluster2.yaml" - path1, path2 = os.path.join(abs_path, cluster1), os.path.join( - abs_path, cluster2) - cr1, cr2 = (yaml.safe_load(open(path1).read()), - yaml.safe_load(open(path2).read())) - # Metadata and field is filled by K8s in real life. - cr1["metadata"]["uid"] = "abc" - cr2["metadata"]["uid"] = "xyz" - return cr1, cr2 - - -class OperatorTest(unittest.TestCase): - def test_no_file_mounts_k8s_operator_cluster_launch(self): - with patch.object(NodeUpdaterThread, START, mock_start),\ - patch.object(NodeUpdaterThread, JOIN, mock_join),\ - patch.object(RayCluster, SETUP_LOGGING, mock_setup_logging),\ - patch.object(RayCluster, WRITE_CONFIG, mock_write_config),\ - patch.object(KubernetesNodeProvider, INIT, mock_init),\ - patch.object(KubernetesNodeProvider, NON_TERMINATED_NODES, - mock_non_terminated_nodes),\ - patch.object(KubernetesNodeProvider, CREATE_NODE, - mock_create_node),\ - patch.object(KubernetesNodeProvider, BOOTSTRAP_CONFIG, - mock_bootstrap_config): - - cluster_cr1, cluster_cr2 = custom_resources() - - # Ensure that operator does not mount any files during cluster - # launch. 
- config1 = cr_to_config(cluster_cr1) - config1["provider"]["namespace"] = "test" - cluster1 = RayCluster(config1) - cluster1.start_head() - - # Check that this test is working correctly by inserting extraneous - # file mounts and confirming a ValueError from the mocked - # NodeUpdater. - config2 = cr_to_config(cluster_cr2) - config2["provider"]["namespace"] = "test" - # Note: There is no user interface for adding file mounts - # to the config of a Ray cluster run via the operator. - # This purely for purposes of testing this test. - config2["file_mounts"] = {"remote_foo": os.path.abspath(__file__)} - cluster2 = RayCluster(config2) - with pytest.raises(ValueError): - cluster2.start_head() - - -if __name__ == "__main__": - import sys - sys.exit(pytest.main(["-v", __file__])) From 7de90c8c01b41f109f9532ab685f3e72c4ad8f89 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 087/244] Revert "[tune/xgboost] Update release test docs (#13880)" This reverts commit fa03e44a788cdd5a79c3f726a67dfc19f20f801c. 
--- release/RELEASE_CHECKLIST.md | 14 -------------- release/RELEASE_PROCESS.rst | 25 +++---------------------- release/xgboost_tests/README.rst | 32 -------------------------------- 3 files changed, 3 insertions(+), 68 deletions(-) delete mode 100644 release/xgboost_tests/README.rst diff --git a/release/RELEASE_CHECKLIST.md b/release/RELEASE_CHECKLIST.md index da2d9145a825..9ab85f30bac0 100644 --- a/release/RELEASE_CHECKLIST.md +++ b/release/RELEASE_CHECKLIST.md @@ -60,20 +60,6 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] K8s Test - [ ] K8s cluster launcher test - [ ] K8s operator test -- [ ] Data processing tests - - [ ] streaming_shuffle -- [x] Tune tests - - [x] ignore for now -- [ ] XGBoost Tests - - [ ] distributed_api_test - - [ ] train_small - - [ ] train_moderate - - [ ] train_gpu - - [ ] tune_small - - [ ] tune_4x32 - - [ ] tune_32x4 - - [ ] ft_small_non_elastic (flaky!) - - [ ] ft_small_elastic (flaky!) ## Final Steps - [ ] Wheels uploaded to Test PyPI diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index f1decb4b6f99..80afb3589316 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -144,11 +144,11 @@ is generally the easiest way to run release tests. Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the whole Python tests to detect memory leaks. -7. **K8s operator tests** +6. **K8s operator tests** Run the ``python/ray/tests/test_k8s_*`` to make sure K8s cluster launcher and operator works. Make sure the docker image is the released version. -8. **Data processing tests** +6. **Data processing tests** .. code-block:: bash @@ -162,26 +162,7 @@ is generally the easiest way to run release tests. **IMPORTANT** Check if the workload scripts has terminated. If so, please record the result (both read/write bandwidth and the shuffle result) to the ``release_logs/data_processing_tests/[test_name]``. 
Both shuffling runtime and read/write bandwidth shouldn't be decreasing more than 15% compared to the previous release. - -9. **Ray Tune release tests** - - General Ray Tune functionality is implicitly tested via RLLib and XGBoost release tests. - We are in the process of introducing scalability envelopes for Ray Tune. - This is an ongoing effort and will only be introduced in the next release. - For now, **you can ignore the tune_tests directory**. - -10. **XGBoost release tests** - - .. code-block:: bash - - xgboost_tests/README.rst - - Follow the instructions to kick off the tests and check the status of the workloads. - The XGBoost release tests use assertions or fail with exceptions and thus - should automatically tell you if they failed or not. - Only in the case of the fault tolerance tests you might want - to check the logs. See the readme for more information. - + Identify and Resolve Release Blockers ------------------------------------- diff --git a/release/xgboost_tests/README.rst b/release/xgboost_tests/README.rst deleted file mode 100644 index 303b09ef92e9..000000000000 --- a/release/xgboost_tests/README.rst +++ /dev/null @@ -1,32 +0,0 @@ -XGBoost on Ray tests -==================== - -This directory contains various XGBoost on Ray release tests. - -You should run these tests with the `releaser `_ tool. - -Overview --------- -There are four kinds of tests: - -1. ``distributed_api_test`` - checks general API functionality and should finish very quickly (< 1 minute) -2. ``train_*`` - checks single trial training on different setups. -3. ``tune_*`` - checks multi trial training via Ray Tune. -4. ``ft_*`` - checks fault tolerance. **These tests are currently flaky** - -Generally the releaser tool will run all tests in parallel, but if you do -it sequentially, be sure to do it in the order above. If ``train_*`` fails, -``tune_*`` will fail, too. - -Flaky fault tolerance tests ---------------------------- -The fault tolerance tests are currently flaky. 
In some runs, more nodes die -than expected, causing the test to fail. In other cases, the re-scheduled -actors become available too soon after crashing, causing the assertions to -fail. Please consider re-running the test a couple of times or contact the -test owner with outputs from the tests for further questions. - -Acceptance criteria -------------------- -These tests are considered passing when they throw no error at the end of -the output log. From 4bf022689986b22a3eb910c8c2449ea64d71f8d0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 088/244] Revert "[docs] ray slack remove banners (#13898)" This reverts commit e780c6fab62970af7c5a252c5010d886a468014b. --- README.rst | 2 ++ doc/source/getting-involved.rst | 3 ++- doc/source/installation.rst | 2 ++ doc/source/raysgd/raysgd.rst | 2 ++ doc/source/rllib.rst | 2 ++ doc/source/serve/index.rst | 2 +- doc/source/tune/index.rst | 3 +++ 7 files changed, 14 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a69fc92272bd..ee025cb38751 100644 --- a/README.rst +++ b/README.rst @@ -300,6 +300,7 @@ More Information Getting Involved ---------------- +- `Community Slack`_: Join our Slack workspace. - `Forum`_: For discussions about development, questions about usage, and feature requests. - `GitHub Issues`_: For reporting bugs. - `Twitter`_: Follow updates on Twitter. @@ -310,4 +311,5 @@ Getting Involved .. _`GitHub Issues`: https://github.com/ray-project/ray/issues .. _`StackOverflow`: https://stackoverflow.com/questions/tagged/ray .. _`Meetup Group`: https://www.meetup.com/Bay-Area-Ray-Meetup/ +.. _`Community Slack`: https://forms.gle/9TSdDYUgxYs8SA9e8 .. 
_`Twitter`: https://twitter.com/raydistributed diff --git a/doc/source/getting-involved.rst b/doc/source/getting-involved.rst index f1ef61b0938e..2ee0318a24a4 100644 --- a/doc/source/getting-involved.rst +++ b/doc/source/getting-involved.rst @@ -6,7 +6,8 @@ Getting Involved / Contributing Ray is more than a framework for distributed applications but also an active community of developers, researchers, and folks that love machine learning. -.. tip:: Ask questions on `our forum `_! The +.. tip:: Join our `community Slack `_ to + discuss Ray or ask questions on `our forum `_! The community is extremely active in helping people succeed in building their Ray applications. diff --git a/doc/source/installation.rst b/doc/source/installation.rst index a35dffea39cc..049d3ed28038 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -3,6 +3,8 @@ Installing Ray ============== +.. tip:: Join our `community slack `_ to discuss Ray! + Ray currently supports MacOS and Linux. Windows wheels are now available, but :ref:`Windows support ` is experimental and under development. diff --git a/doc/source/raysgd/raysgd.rst b/doc/source/raysgd/raysgd.rst index 85fd335f3fd8..5ab6503e44ad 100644 --- a/doc/source/raysgd/raysgd.rst +++ b/doc/source/raysgd/raysgd.rst @@ -14,6 +14,8 @@ The main features are: - **Composability**: RaySGD is built on top of the Ray Actor API, enabling seamless integration with existing Ray applications such as RLlib, Tune, and Ray.Serve. - **Scale up and down**: Start on single CPU. Scale up to multi-node, multi-CPU, or multi-GPU clusters by changing 2 lines of code. +.. tip:: Join our `community slack `_ to discuss Ray! 
+ Getting Started --------------- diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 33a808a042cd..bbe35f36ea60 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -9,6 +9,8 @@ RLlib is an open-source library for reinforcement learning that offers both high To get started, take a look over the `custom env example `__ and the `API documentation `__. If you're looking to develop custom algorithms with RLlib, also check out `concepts and custom algorithms `__. +.. tip:: Join our `community slack `_ to discuss Ray/RLlib! + RLlib in 60 seconds ------------------- diff --git a/doc/source/serve/index.rst b/doc/source/serve/index.rst index f15093b6c0cb..e9f76d89b7a9 100644 --- a/doc/source/serve/index.rst +++ b/doc/source/serve/index.rst @@ -30,7 +30,7 @@ Ray Serve can be used in two primary ways to deploy your models at scale: .. tip:: - Chat with Ray Serve users and developers on our `forum `_! + Chat with Ray Serve users and developers on our `community Slack `_ in the #serve channel and on our `forum `_! .. note:: Starting with Ray version 1.2.0, Ray Serve backends take in a Starlette Request object instead of a Flask Request object. diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst index 59fd6ad0efaf..2003b2eacb80 100644 --- a/doc/source/tune/index.rst +++ b/doc/source/tune/index.rst @@ -21,6 +21,9 @@ Tune is a Python library for experiment execution and hyperparameter tuning at a **Want to get started?** Head over to the :doc:`Key Concepts page `. +.. tip:: Join the `Ray community slack `_ to discuss Ray Tune (and other Ray libraries)! + + Quick Start ----------- From 819b7fd98241d9b5231d6f4d5fbd13d68fbfdc5f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 089/244] Revert "[tune] fix gpu check (#13825)" This reverts commit de2a997cd86805b1736f2dccf8bd0d0d9f9fc1a7. 
--- python/ray/tune/utils/util.py | 53 +++++++++-------------------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/python/ray/tune/utils/util.py b/python/ray/tune/utils/util.py index 02daa858fd75..688261fdb2c0 100644 --- a/python/ray/tune/utils/util.py +++ b/python/ray/tune/utils/util.py @@ -462,29 +462,27 @@ def load_newest_checkpoint(dirpath: str, ckpt_pattern: str) -> dict: return checkpoint_state -def wait_for_gpu(gpu_id=None, - target_util=0.01, - retry=20, - gpu_memory_limit=None): +def wait_for_gpu(gpu_id=None, gpu_memory_limit=0.1, retry=20): """Checks if a given GPU has freed memory. Requires ``gputil`` to be installed: ``pip install gputil``. Args: - gpu_id (Optional[Union[int, str]]): GPU id or uuid to check. - Must be found within GPUtil.getGPUs(). If none, resorts to + gpu_id (Optional[str]): GPU id to check. Must be found + within GPUtil.getGPUs(). If none, resorts to the first item returned from `ray.get_gpu_ids()`. - target_util (float): The utilization threshold to reach to unblock. - Set this to 0 to block until the GPU is completely free. + gpu_memory_limit (float): If memory usage is below + this quantity, the check will break. retry (int): Number of times to check GPU limit. Sleeps 5 seconds between checks. - gpu_memory_limit (float): Deprecated. Returns: - bool: True if free. + bool + True if free. Raises: - RuntimeError: If GPUtil is not found, if no GPUs are detected + RuntimeError + If GPUtil is not found, if no GPUs are detected or if the check fails. Example: @@ -497,43 +495,20 @@ def tune_func(config): tune.run(tune_func, resources_per_trial={"GPU": 1}, num_samples=10) """ - if gpu_memory_limit: - raise ValueError("'gpu_memory_limit' is deprecated. 
" - "Use 'target_util' instead.") if GPUtil is None: raise RuntimeError( "GPUtil must be installed if calling `wait_for_gpu`.") - if gpu_id is None: + if not gpu_id: gpu_id_list = ray.get_gpu_ids() if not gpu_id_list: raise RuntimeError(f"No GPU ids found from {ray.get_gpu_ids()}. " "Did you set Tune resources correctly?") gpu_id = gpu_id_list[0] - - if isinstance(gpu_id, int): - list_gpu_ids = [g.id for g in GPUtil.getGPUs()] - if gpu_id not in list_gpu_ids: - raise ValueError( - f"{gpu_id} (int) not found in GPU ids: {list_gpu_ids}. " - "wait_for_gpu takes either int (gpu id) or str (gpu uuid).") - elif isinstance(gpu_id, str): - list_uuids = [g.uuid for g in GPUtil.getGPUs()] - if gpu_id not in list_uuids: - raise ValueError( - f"{gpu_id} (str) not found in GPU uuids: {list_uuids}. " - "wait_for_gpu takes either int (gpu id) or str (gpu uuid).") - else: - raise ValueError(f"gpu_id must be int or str -- got ({type(gpu_id)})") - + gpu_object = GPUtil.getGPUs()[gpu_id] for i in range(int(retry)): - if isinstance(gpu_id, int): - gpu_object = [g for g in GPUtil.getGPUs() if g.id == gpu_id][0] - else: - gpu_object = [g for g in GPUtil.getGPUs() if g.uuid == gpu_id][0] - - if gpu_object.memoryUtil > target_util: - logger.info(f"Waiting for GPU util to reach {target_util}. " - f"Util: {gpu_object.memoryUtil:0.3f}") + if gpu_object.memoryUsed > gpu_memory_limit: + logger.info(f"Waiting for GPU {gpu_id} memory to free. " + f"Mem: {gpu_object.memoryUsed:0.3f}") time.sleep(5) else: return True From c4c7f6102126a03350c8ce0ce11cb6ca4759fba1 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 090/244] Revert "Check in shuffle code as experimental (#13899)" This reverts commit 154730bfae934112d0c3c89d94bc5c3bdae05ade. 
--- python/ray/experimental/shuffle.py | 213 ----------------------------- python/ray/tests/BUILD | 1 - python/ray/tests/test_shuffle.py | 12 -- 3 files changed, 226 deletions(-) delete mode 100644 python/ray/experimental/shuffle.py delete mode 100644 python/ray/tests/test_shuffle.py diff --git a/python/ray/experimental/shuffle.py b/python/ray/experimental/shuffle.py deleted file mode 100644 index 6b7936ddf85b..000000000000 --- a/python/ray/experimental/shuffle.py +++ /dev/null @@ -1,213 +0,0 @@ -"""A simple distributed shuffle implementation in Ray. - -This utility provides a `simple_shuffle` function that can be used to -redistribute M input partitions into N output partitions. It does this with -a single wave of shuffle map tasks followed by a single wave of shuffle reduce -tasks. Each shuffle map task generates O(N) output objects, and each shuffle -reduce task consumes O(M) input objects, for a total of O(N*M) objects. - -To try an example 10GB shuffle, run: - - $ python -m ray.experimental.shuffle \ - --num-partitions=50 --partition-size=200e6 \ - --object-store-memory=1e9 - -This will print out some statistics on the shuffle execution such as: - - --- Aggregate object store stats across all nodes --- - Plasma memory usage 0 MiB, 0 objects, 0.0% full - Spilled 9487 MiB, 2487 objects, avg write throughput 1023 MiB/s - Restored 9487 MiB, 2487 objects, avg read throughput 1358 MiB/s - Objects consumed by Ray tasks: 9537 MiB. - - Shuffled 9536 MiB in 16.579771757125854 seconds -""" - -from typing import List, Iterable, Tuple, Callable, Any - -import ray -from ray import ObjectRef - -# TODO(ekl) why doesn't TypeVar() deserialize properly in Ray? -# The type produced by the input reader function. -InType = Any -# The type produced by the output writer function. -OutType = Any -# Integer identifying the partition number. -PartitionID = int - - -class ObjectStoreWriter: - """This class is used to stream shuffle map outputs to the object store. 
- - It can be subclassed to optimize writing (e.g., batching together small - records into larger objects). This will be performance critical if your - input records are small (the example shuffle uses very large records, so - the naive strategy works well). - """ - - def __init__(self): - self.results = [] - - def add(self, item: InType) -> None: - """Queue a single item to be written to the object store. - - This base implementation immediately writes each given item to the - object store as a standalone object. - """ - self.results.append(ray.put(item)) - - def finish(self) -> List[ObjectRef]: - """Return list of object refs representing written items.""" - return self.results - - -def round_robin_partitioner(input_stream: Iterable[InType], num_partitions: int - ) -> Iterable[Tuple[PartitionID, InType]]: - """Round robin partitions items from the input reader. - - You can write custom partitioning functions for your use case. - - Args: - input_stream: Iterator over items from the input reader. - num_partitions: Number of output partitions. - - Yields: - Tuples of (partition id, input item). - """ - i = 0 - for item in input_stream: - yield (i, item) - i += 1 - i %= num_partitions - - -def simple_shuffle( - *, - input_reader: Callable[[PartitionID], Iterable[InType]], - input_num_partitions: int, - output_num_partitions: int, - output_writer: Callable[[PartitionID, List[ObjectRef]], OutType], - partitioner: Callable[[Iterable[InType], int], Iterable[ - PartitionID]] = round_robin_partitioner, - object_store_writer: ObjectStoreWriter = ObjectStoreWriter, -) -> List[OutType]: - """Simple distributed shuffle in Ray. - - Args: - input_reader: Function that generates the input items for a - partition (e.g., data records). - input_num_partitions: The number of input partitions. - output_num_partitions: The desired number of output partitions. - output_writer: Function that consumes a iterator of items for a - given output partition. 
It returns a single value that will be - collected across all output partitions. - partitioner: Partitioning function to use. Defaults to round-robin - partitioning of input items. - object_store_writer: Class used to write input items to the - object store in an efficient way. Defaults to a naive - implementation that writes each input record as one object. - - Returns: - List of outputs from the output writers. - """ - - @ray.remote(num_returns=output_num_partitions) - def shuffle_map(i: PartitionID) -> List[List[ObjectRef]]: - writers = [object_store_writer() for _ in range(output_num_partitions)] - for out_i, item in partitioner(input_reader(i), output_num_partitions): - writers[out_i].add(item) - return [c.finish() for c in writers] - - @ray.remote - def shuffle_reduce(i: PartitionID, - *mapper_outputs: List[List[ObjectRef]]) -> OutType: - input_objects = [] - assert len(mapper_outputs) == input_num_partitions - for obj_refs in mapper_outputs: - for obj_ref in obj_refs: - input_objects.append(obj_ref) - return output_writer(i, input_objects) - - shuffle_map_out = [ - shuffle_map.remote(i) for i in range(input_num_partitions) - ] - - shuffle_reduce_out = [ - shuffle_reduce.remote( - j, *[shuffle_map_out[i][j] for i in range(input_num_partitions)]) - for j in range(output_num_partitions) - ] - - return ray.get(shuffle_reduce_out) - - -@ray.remote -class _StatusTracker: - def __init__(self): - self.num_map = 0 - self.num_reduce = 0 - - def inc(self): - self.num_map += 1 - print("Num map tasks finished", self.num_map) - - def inc2(self): - self.num_reduce += 1 - print("Num reduce tasks finished", self.num_reduce) - - -def main(): - import argparse - import numpy as np - import time - - parser = argparse.ArgumentParser() - parser.add_argument("--ray-address", type=str, default=None) - parser.add_argument("--object-store-memory", type=float, default=1e9) - parser.add_argument("--num-partitions", type=int, default=5) - parser.add_argument("--partition-size", 
type=float, default=200e6) - args = parser.parse_args() - - ray.init( - address=args.ray_address, object_store_memory=args.object_store_memory) - - partition_size = int(args.partition_size) - num_partitions = args.num_partitions - rows_per_partition = partition_size // (8 * 2) - tracker = _StatusTracker.remote() - - def input_reader(i: PartitionID) -> Iterable[InType]: - for _ in range(num_partitions): - yield np.ones( - (rows_per_partition // num_partitions, 2), dtype=np.int64) - tracker.inc.remote() - - def output_writer(i: PartitionID, - shuffle_inputs: List[ObjectRef]) -> OutType: - total = 0 - # TODO(ekl) using ray.wait can be more efficient for pipelining. - for obj_ref in shuffle_inputs: - arr = ray.get(obj_ref) - total += arr.size * arr.itemsize - tracker.inc2.remote() - return total - - start = time.time() - output_sizes = simple_shuffle( - input_reader=input_reader, - input_num_partitions=num_partitions, - output_num_partitions=num_partitions, - output_writer=output_writer) - delta = time.time() - start - - time.sleep(.5) - print() - print(ray.internal.internal_api.memory_summary(stats_only=True)) - print() - print("Shuffled", int(sum(output_sizes) / (1024 * 1024)), "MiB in", delta, - "seconds") - - -if __name__ == "__main__": - main() diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 806f04fe56df..6bb68b8543cb 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -102,7 +102,6 @@ py_test_module_list( "test_queue.py", "test_ray_debugger.py", "test_ray_init.py", - "test_shuffle.py", "test_tempfile.py", ], size = "small", diff --git a/python/ray/tests/test_shuffle.py b/python/ray/tests/test_shuffle.py deleted file mode 100644 index 31a62f691c9b..000000000000 --- a/python/ray/tests/test_shuffle.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest -import sys - -from ray.experimental import shuffle - - -def test_shuffle(): - shuffle.main() - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) From 
5736823a32346bcfa27da2e843a827922405ec26 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 091/244] Revert "Fall back to random port instead of default port for non-primary Redis shards; attempt to cluster Redis shard ports close to each other. (#13847)" This reverts commit 6452d71984ac7f28c0ecba1424500527439070ef. --- python/ray/_private/services.py | 45 ++++++++++++--------------------- python/ray/parameter.py | 7 ++--- 2 files changed, 18 insertions(+), 34 deletions(-) diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 4ae4fed1758e..996cede111d6 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -829,13 +829,6 @@ def start_redis(node_ip_address, redis_modules = [REDIS_MODULE] redis_stdout_file, redis_stderr_file = redirect_files[0] - # If no port is given, fallback to default Redis port for the primary - # shard. - if port is None: - port = ray_constants.DEFAULT_PORT - num_retries = 20 - else: - num_retries = 1 # Start the primary Redis shard. port, p = _start_redis_instance( redis_executable, @@ -843,7 +836,6 @@ def start_redis(node_ip_address, port=port, password=password, redis_max_clients=redis_max_clients, - num_retries=num_retries, # Below we use None to indicate no limit on the memory of the # primary Redis shard. redis_max_memory=None, @@ -877,29 +869,17 @@ def start_redis(node_ip_address, # Start other Redis shards. Each Redis shard logs to a separate file, # prefixed by "redis-". redis_shards = [] - # Attempt to start the other Redis shards port range right after the - # primary Redis shard port. 
- last_shard_port = port for i in range(num_redis_shards): redis_stdout_file, redis_stderr_file = redirect_files[i + 1] redis_executable = REDIS_EXECUTABLE redis_modules = [REDIS_MODULE] - redis_shard_port = redis_shard_ports[i] - # If no shard port is given, try to start this shard's Redis instance - # on the port right after the last shard's port. - if redis_shard_port is None: - redis_shard_port = last_shard_port + 1 - num_retries = 20 - else: - num_retries = 1 redis_shard_port, p = _start_redis_instance( redis_executable, modules=redis_modules, - port=redis_shard_port, + port=redis_shard_ports[i], password=password, redis_max_clients=redis_max_clients, - num_retries=num_retries, redis_max_memory=redis_max_memory, stdout_file=redis_stdout_file, stderr_file=redis_stderr_file, @@ -910,14 +890,13 @@ def start_redis(node_ip_address, redis_shards.append(shard_address) # Store redis shard information in the primary redis shard. primary_redis_client.rpush("RedisShards", shard_address) - last_shard_port = redis_shard_port return redis_address, redis_shards, processes def _start_redis_instance(executable, modules, - port, + port=None, redis_max_clients=None, num_retries=20, stdout_file=None, @@ -928,19 +907,20 @@ def _start_redis_instance(executable, """Start a single Redis server. Notes: - We will initially try to start the Redis instance at the given port, - and then try at most `num_retries - 1` times to start the Redis - instance at successive random ports. + If "port" is not None, then we will only use this port and try + only once. Otherwise, we will first try the default redis port, + and if it is unavailable, we will try random ports with + maximum retries of "num_retries". Args: executable (str): Full path of the redis-server executable. modules (list of str): A list of pathnames, pointing to the redis module(s) that will be loaded in this redis server. - port (int): Try to start a Redis server at this port. 
+ port (int): If provided, start a Redis server with this port. redis_max_clients: If this is provided, Ray will attempt to configure Redis with this maxclients number. - num_retries (int): The number of times to attempt to start Redis at - successive ports. + num_retries (int): The number of times to attempt to start Redis. If a + port is provided, this defaults to 1. stdout_file: A file handle opened for writing to redirect stdout to. If no redirection should happen, then this should be None. stderr_file: A file handle opened for writing to redirect stderr to. If @@ -963,6 +943,13 @@ def _start_redis_instance(executable, for module in modules: assert os.path.isfile(module) counter = 0 + if port is not None: + # If a port is specified, then try only once to connect. + # This ensures that we will use the given port. + num_retries = 1 + else: + port = ray_constants.DEFAULT_PORT + load_module_args = [] for module in modules: load_module_args += ["--loadmodule", module] diff --git a/python/ray/parameter.py b/python/ray/parameter.py index 043cc258c0d9..af7bdf47593d 100644 --- a/python/ray/parameter.py +++ b/python/ray/parameter.py @@ -17,12 +17,9 @@ class RayParams: raylet, a plasma store, a plasma manager, and some workers. It will also kill these processes when Python exits. redis_port (int): The port that the primary Redis shard should listen - to. If None, then it will fall back to - ray.ray_constants.DEFAULT_PORT, or a random port if the default is - not available. + to. If None, then a random port will be chosen. redis_shard_ports: A list of the ports to use for the non-primary Redis - shards. If None, then it will fall back to the ports right after - redis_port, or random ports if those are not available. + shards. num_cpus (int): Number of CPUs to configure the raylet with. num_gpus (int): Number of GPUs to configure the raylet with. 
resources: A dictionary mapping the name of a resource to the quantity From f86a7702e6b97795908770c3383ee1e3e5cc2089 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 092/244] Revert "Scalability envelope readme typo (#13874)" This reverts commit 85011a3c15e304682ee0382a7199c102587e95fb. --- benchmarks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 352845dd02b5..2167151656a9 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -26,7 +26,7 @@ All single node benchmarks are run on a single m4.16xlarge. | Dimension | Quantity | | --------- | -------- | -| # of object arguments to a single task | 10000+ | +| # of object artuments to a single task | 10000+ | | # of objects returned from a single task | 3000+ | | # of plasma objects in a single `ray.get` call | 10000+ | | # of tasks queued on a single node | 1,000,000+ | From 66d57302e11f0597461d7f87a0a7d0ae8931e895 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 093/244] Revert "Rename timeout to period with heartbeat interval (#13872)" This reverts commit 187162d95bb060967143503473623925612dbce1. 
--- python/ray/includes/ray_config.pxd | 2 +- python/ray/includes/ray_config.pxi | 4 ++-- python/ray/tests/test_actor_failures.py | 2 +- python/ray/tests/test_failure.py | 4 ++-- python/ray/tests/test_reconstruction.py | 18 +++++++++--------- src/ray/common/ray_config_def.h | 4 ++-- .../gcs/gcs_server/gcs_heartbeat_manager.cc | 2 +- src/ray/raylet/main.cc | 2 +- src/ray/raylet/node_manager.cc | 4 ++-- 9 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/ray/includes/ray_config.pxd b/python/ray/includes/ray_config.pxd index 309132cf74c6..079f30690998 100644 --- a/python/ray/includes/ray_config.pxd +++ b/python/ray/includes/ray_config.pxd @@ -13,7 +13,7 @@ cdef extern from "ray/common/ray_config.h" nogil: int64_t handler_warning_timeout_ms() const - int64_t raylet_heartbeat_period_milliseconds() const + int64_t raylet_heartbeat_timeout_milliseconds() const int64_t debug_dump_period_milliseconds() const diff --git a/python/ray/includes/ray_config.pxi b/python/ray/includes/ray_config.pxi index d6c28805c48c..96a2a14f24d8 100644 --- a/python/ray/includes/ray_config.pxi +++ b/python/ray/includes/ray_config.pxi @@ -10,8 +10,8 @@ cdef class Config: return RayConfig.instance().handler_warning_timeout_ms() @staticmethod - def raylet_heartbeat_period_milliseconds(): - return RayConfig.instance().raylet_heartbeat_period_milliseconds() + def raylet_heartbeat_timeout_milliseconds(): + return RayConfig.instance().raylet_heartbeat_timeout_milliseconds() @staticmethod def debug_dump_period_milliseconds(): diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 4e2e19f1bfd0..227fb48d211d 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -275,7 +275,7 @@ def call_other(self, counter, signal): def test_actor_restart_on_node_failure(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + 
"raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 1000, "task_retry_delay_ms": 100, } diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index f6aad1fa3185..abd82011d1e4 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -990,7 +990,7 @@ def sleep_to_kill_raylet(): def test_connect_with_disconnected_node(shutdown_only): config = { "num_heartbeats_timeout": 50, - "raylet_heartbeat_period_milliseconds": 10, + "raylet_heartbeat_timeout_milliseconds": 10, } cluster = Cluster() cluster.add_node(num_cpus=0, _system_config=config) @@ -1202,7 +1202,7 @@ def get(obj_refs, test_dependent_task): def test_fate_sharing(ray_start_cluster, use_actors, node_failure): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, } cluster = Cluster() # Head node with no resources. diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index 35d00a9b819d..1cd1f133a911 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -17,7 +17,7 @@ def test_cached_object(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } cluster = ray_start_cluster @@ -59,7 +59,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. 
@@ -118,7 +118,7 @@ def dependent_task(x): def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -167,7 +167,7 @@ def dependent_task(x): def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -224,7 +224,7 @@ def test_basic_reconstruction_actor_task(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -297,7 +297,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -377,7 +377,7 @@ def probe(): def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. 
@@ -442,7 +442,7 @@ def dependent_task(x): def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -494,7 +494,7 @@ def dependent_task(x): def test_reconstruction_stress(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "max_direct_call_object_size": 100, "task_retry_delay_ms": 100, "object_timeout_milliseconds": 200, diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index f109bbd59ea9..cd6bd84cee9c 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -35,7 +35,7 @@ RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000) RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000) /// The duration between heartbeats sent by the raylets. -RAY_CONFIG(int64_t, raylet_heartbeat_period_milliseconds, 100) +RAY_CONFIG(int64_t, raylet_heartbeat_timeout_milliseconds, 100) /// If a component has not sent a heartbeat in the last num_heartbeats_timeout /// heartbeat intervals, the raylet monitor process will report /// it as dead to the db_client table. @@ -93,7 +93,7 @@ RAY_CONFIG(bool, record_ref_creation_sites, true) /// serialized, then either passed as an argument or returned from a task. /// NOTE(swang): The timer is checked by the raylet during every heartbeat, so /// this should be set to a value larger than -/// raylet_heartbeat_period_milliseconds. +/// raylet_heartbeat_timeout_milliseconds. 
RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000) /// If object_pinning_enabled is on, then objects that have been unpinned are diff --git a/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc b/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc index 5991c20a8f0e..b6dd56945cbf 100644 --- a/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc @@ -103,7 +103,7 @@ void GcsHeartbeatManager::DetectDeadNodes() { void GcsHeartbeatManager::ScheduleTick() { auto heartbeat_period = boost::posix_time::milliseconds( - RayConfig::instance().raylet_heartbeat_period_milliseconds()); + RayConfig::instance().raylet_heartbeat_timeout_milliseconds()); detect_timer_.expires_from_now(heartbeat_period); detect_timer_.async_wait([this](const boost::system::error_code &error) { if (error == boost::asio::error::operation_aborted) { diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index 1d47f23b356a..ba6a53ee473f 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -196,7 +196,7 @@ int main(int argc, char *argv[]) { } node_manager_config.heartbeat_period_ms = - RayConfig::instance().raylet_heartbeat_period_milliseconds(); + RayConfig::instance().raylet_heartbeat_timeout_milliseconds(); node_manager_config.report_resources_period_ms = RayConfig::instance().raylet_report_resources_period_milliseconds(); node_manager_config.debug_dump_period_ms = diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index d0e3be78b23f..2c20bab40a39 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -410,7 +410,7 @@ void NodeManager::Heartbeat() { uint64_t now_ms = current_time_ms(); uint64_t interval = now_ms - last_heartbeat_at_ms_; if (interval > RayConfig::instance().num_heartbeats_warning() * - RayConfig::instance().raylet_heartbeat_period_milliseconds()) { + RayConfig::instance().raylet_heartbeat_timeout_milliseconds()) { RAY_LOG(WARNING) << "Last heartbeat was sent " << 
interval << " ms ago. There might be resource pressure on this node. If heartbeat keeps " @@ -723,7 +723,7 @@ void NodeManager::NodeRemoved(const NodeID &node_id) { << "Exiting because this node manager has mistakenly been marked dead by the " << "monitor: GCS didn't receive heartbeats within timeout " << RayConfig::instance().num_heartbeats_timeout() * - RayConfig::instance().raylet_heartbeat_period_milliseconds() + RayConfig::instance().raylet_heartbeat_timeout_milliseconds() << " ms. This is likely since the machine or raylet became overloaded."; // Below, when we remove node_id from all of these data structures, we could From 1fa76911bbd3cedaf93cbf861864602f14b7b5b7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 094/244] Revert "Always replace DEL with UNLINK (#13832)" This reverts commit bac8ffa50261b45edacd05b191670ae09bb0b3a2. --- src/ray/gcs/store_client/redis_store_client.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/ray/gcs/store_client/redis_store_client.cc b/src/ray/gcs/store_client/redis_store_client.cc index 0216b92a6942..4db20698861d 100644 --- a/src/ray/gcs/store_client/redis_store_client.cc +++ b/src/ray/gcs/store_client/redis_store_client.cc @@ -104,8 +104,7 @@ Status RedisStoreClient::AsyncDelete(const std::string &table_name, } std::string redis_key = GenRedisKey(table_name, key); - // We always replace `DEL` with `UNLINK`. - std::vector args = {"UNLINK", redis_key}; + std::vector args = {"DEL", redis_key}; auto shard_context = redis_client_->GetShardContext(redis_key); return shard_context->RunArgvAsync(args, delete_callback); @@ -219,11 +218,10 @@ Status RedisStoreClient::DoPut(const std::string &key, const std::string &data, Status RedisStoreClient::DeleteByKeys(const std::vector &keys, const StatusCallback &callback) { - // Delete for each shard. - // We always replace `DEL` with `UNLINK`. 
+ // The `DEL` command for each shard. int total_count = 0; auto del_commands_by_shards = - GenCommandsByShards(redis_client_, "UNLINK", keys, &total_count); + GenCommandsByShards(redis_client_, "DEL", keys, &total_count); auto finished_count = std::make_shared(0); From 30429fab1951b4cb7a4d8b845c33203eed132bbc Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 095/244] Revert "[autoscaler][kubernetes][operator] Rudimentary error handling, make "MODIFIED" -> update event work. (#13756)" This reverts commit 3749d2d15da1c7f3646caa91b987922f64c95599. --- doc/source/cluster/k8s-operator.rst | 3 -- .../operator_configs/cluster_crd.yaml | 16 -------- .../kubernetes/operator_configs/operator.yaml | 2 +- python/ray/ray_operator/operator.py | 39 +++---------------- python/ray/ray_operator/operator_utils.py | 13 ------- .../ray/tests/test_k8s_operator_examples.py | 14 +------ 6 files changed, 7 insertions(+), 80 deletions(-) diff --git a/doc/source/cluster/k8s-operator.rst b/doc/source/cluster/k8s-operator.rst index d846fe029177..2fb8efef8974 100644 --- a/doc/source/cluster/k8s-operator.rst +++ b/doc/source/cluster/k8s-operator.rst @@ -19,9 +19,6 @@ The rest of this document explains step-by-step how to use the Ray Kubernetes Op .. role:: bash(code) :language: bash -.. note:: - The Ray Kubernetes Operator is still experimental. For the yaml files in the examples below, we recomend using the latest master version of Ray. - .. warning:: The Ray Kubernetes Operator requires Kubernetes version at least ``v1.17.0``. Check Kubernetes version info with the command :bash:`kubectl version`. 
diff --git a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml index 5387803c136e..75a802b58d87 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml @@ -13,16 +13,6 @@ spec: - name: v1 served: true storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: status - type: string - description: Running or Error - jsonPath: .status.phase - - name: age - type: date - jsonPath: .metadata.creationTimestamp schema: openAPIV3Schema: description: Ray cluster configuration @@ -30,12 +20,6 @@ spec: required: - spec properties: - status: - type: object - properties: - phase: - description: Running or Error - type: string spec: type: object required: diff --git a/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml b/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml index 6f259a9a7467..2c170f072df8 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml @@ -10,7 +10,7 @@ metadata: name: ray-operator-role rules: - apiGroups: ["", "cluster.ray.io"] - resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec"] + resources: ["rayclusters", "rayclusters/finalizers", "pods", "pods/exec"] verbs: ["get", "watch", "list", "create", "delete", "patch", "update"] --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/python/ray/ray_operator/operator.py b/python/ray/ray_operator/operator.py index e39f4cfef322..cc03c2fefc8f 100644 --- a/python/ray/ray_operator/operator.py +++ b/python/ray/ray_operator/operator.py @@ -12,12 +12,10 @@ from ray.ray_operator import operator_utils from ray import ray_constants -logger = logging.getLogger(__name__) - class RayCluster(): def __init__(self, config: Dict[str, Any]): - self.set_config(config) + 
self.config = config self.name = self.config["cluster_name"] self.config_path = operator_utils.config_path(self.name) @@ -25,9 +23,6 @@ def __init__(self, config: Dict[str, Any]): self.subprocess = None # type: Optional[mp.Process] - def set_config(self, config: Dict[str, Any]) -> None: - self.config = config - def do_in_subprocess(self, f: Callable[[], None], wait_to_finish: bool = False) -> None: @@ -101,42 +96,18 @@ def delete_config(self) -> None: ray_clusters = {} -last_generation = {} - -def handle_event(event_type, cluster_cr, cluster_name): - # TODO: This only detects errors in the parent process and thus doesn't - # catch cluster-specific autoscaling failures. Fix that (perhaps at - # the same time that we eliminate subprocesses). - try: - cluster_action(event_type, cluster_cr, cluster_name) - except Exception: - logger.exception(f"Error while updating RayCluster {cluster_name}.") - operator_utils.set_status(cluster_cr, cluster_name, "Error") - -def cluster_action(event_type, cluster_cr, cluster_name) -> None: - - cluster_config = operator_utils.cr_to_config(cluster_cr) +def cluster_action(cluster_config: Dict[str, Any], event_type: str) -> None: cluster_name = cluster_config["cluster_name"] - if event_type == "ADDED": - operator_utils.set_status(cluster_cr, cluster_name, "Running") ray_clusters[cluster_name] = RayCluster(cluster_config) ray_clusters[cluster_name].create_or_update() - last_generation[cluster_name] = cluster_cr["metadata"]["generation"] elif event_type == "MODIFIED": - # Check metadata.generation to determine if there's a spec change. 
- current_generation = cluster_cr["metadata"]["generation"] - if current_generation > last_generation[cluster_name]: - ray_clusters[cluster_name].set_config(cluster_config) - ray_clusters[cluster_name].create_or_update() - last_generation[cluster_name] = current_generation - + ray_clusters[cluster_name].create_or_update() elif event_type == "DELETED": ray_clusters[cluster_name].clean_up() del ray_clusters[cluster_name] - del last_generation[cluster_name] def main() -> None: @@ -148,9 +119,9 @@ def main() -> None: try: for event in cluster_cr_stream: cluster_cr = event["object"] - cluster_name = cluster_cr["metadata"]["name"] event_type = event["type"] - handle_event(event_type, cluster_cr, cluster_name) + cluster_config = operator_utils.cr_to_config(cluster_cr) + cluster_action(cluster_config, event_type) except ApiException as e: if e.status == 404: raise Exception( diff --git a/python/ray/ray_operator/operator_utils.py b/python/ray/ray_operator/operator_utils.py index e20cd6719b21..5d51baebbd77 100644 --- a/python/ray/ray_operator/operator_utils.py +++ b/python/ray/ray_operator/operator_utils.py @@ -99,16 +99,3 @@ def translate(configuration: Dict[str, Any], dictionary[field]: configuration[field] for field in dictionary if field in configuration } - - -def set_status(cluster_cr: Dict[str, Any], cluster_name: str, - status: str) -> None: - # TODO: Add retry logic in case of 409 due to old resource version. 
- cluster_cr["status"] = {"phase": status} - custom_objects_api()\ - .patch_namespaced_custom_object_status(namespace=RAY_NAMESPACE, - group="cluster.ray.io", - version="v1", - plural="rayclusters", - name=cluster_name, - body=cluster_cr) diff --git a/python/ray/tests/test_k8s_operator_examples.py b/python/ray/tests/test_k8s_operator_examples.py index 1636b347bd14..6ca2aca370b2 100644 --- a/python/ray/tests/test_k8s_operator_examples.py +++ b/python/ray/tests/test_k8s_operator_examples.py @@ -1,6 +1,5 @@ -"""Tests launch, teardown, and update of multiple Ray clusters using Kubernetes +"""Tests launch and teardown of multiple Ray clusters using Kubernetes operator.""" -import copy import sys import os import subprocess @@ -131,17 +130,6 @@ def test_examples(self): # Four pods remain wait_for_pods(4) - # Check that cluster updates work: increase minWorkers to 3 - # and check that one worker is created. - example_cluster_edit = copy.deepcopy(example_cluster_config) - example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 3 - yaml.dump(example_cluster_edit, example_cluster_file) - example_cluster_file.flush() - cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}" - subprocess.check_call(cm, shell=True) - - wait_for_pods(5) - # Delete the first cluster cmd = f"kubectl -n {NAMESPACE} delete -f"\ f"{example_cluster_file.name}" From 4abc772f7319008dce0d5999113b9ab66e22295e Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 096/244] Revert "Check Ray client protocol version (#13886)" This reverts commit 48085c7ba9d9c9f94a8e94f552553148efe0db10. 
--- python/ray/tests/test_client_init.py | 41 +------------------ python/ray/util/client/__init__.py | 18 +------- python/ray/util/client/server/dataservicer.py | 5 ++- 3 files changed, 6 insertions(+), 58 deletions(-) diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py index 6b6ce8a42598..9528f1d202fe 100644 --- a/python/ray/tests/test_client_init.py +++ b/python/ray/tests/test_client_init.py @@ -8,7 +8,7 @@ import ray.util.client.server.server as ray_client_server import ray.core.generated.ray_client_pb2 as ray_client_pb2 -from ray.util.client import RayAPIStub, CURRENT_PROTOCOL_VERSION +from ray.util.client import RayAPIStub import ray @@ -109,45 +109,6 @@ def mock_connection_response(): python_version="2.7.12", ray_version="", ray_commit="", - protocol_version=CURRENT_PROTOCOL_VERSION, - ) - - # inject mock connection function - server_handle.data_servicer._build_connection_response = \ - mock_connection_response - - ray = RayAPIStub() - with pytest.raises(RuntimeError): - _ = ray.connect("localhost:50051") - - ray = RayAPIStub() - info3 = ray.connect("localhost:50051", ignore_version=True) - assert info3["num_clients"] == 1, info3 - ray.disconnect() - finally: - ray_client_server.shutdown_with_server(server_handle.grpc_server) - time.sleep(2) - - -def test_protocol_version(): - - server_handle, _ = ray_client_server.init_and_serve("localhost:50051") - try: - ray = RayAPIStub() - info1 = ray.connect("localhost:50051") - local_py_version = ".".join( - [str(x) for x in list(sys.version_info)[:3]]) - assert info1["protocol_version"] == CURRENT_PROTOCOL_VERSION, info1 - ray.disconnect() - time.sleep(1) - - def mock_connection_response(): - return ray_client_pb2.ConnectionInfoResponse( - num_clients=1, - python_version=local_py_version, - ray_version="", - ray_commit="", - protocol_version="2050-01-01", # from the future ) # inject mock connection function diff --git a/python/ray/util/client/__init__.py 
b/python/ray/util/client/__init__.py index 3fdcd4f8810c..9a2d14877936 100644 --- a/python/ray/util/client/__init__.py +++ b/python/ray/util/client/__init__.py @@ -5,10 +5,6 @@ logger = logging.getLogger(__name__) -# This version string is incremented to indicate breaking changes in the -# protocol that require upgrading the client version. -CURRENT_PROTOCOL_VERSION = "2020-02-01" - class RayAPIStub: """This class stands in as the replacement API for the `import ray` module. @@ -39,9 +35,6 @@ def connect(self, conn_str: Connection string, in the form "[host]:port" secure: Whether to use a TLS secured gRPC channel metadata: gRPC metadata to send on connect - connection_retries: number of connection attempts to make - ignore_version: whether to ignore Python or Ray version mismatches. - This should only be used for debugging purposes. Returns: Dictionary of connection info, e.g., {"num_clients": 1}. @@ -73,8 +66,7 @@ def connect(self, self.disconnect() raise - def _check_versions(self, conn_info: Dict[str, Any], - ignore_version: bool) -> None: + def _check_versions(self, conn_info, ignore_version: bool) -> None: local_major_minor = f"{sys.version_info[0]}.{sys.version_info[1]}" if not conn_info["python_version"].startswith(local_major_minor): version_str = f"{local_major_minor}.{sys.version_info[2]}" @@ -85,14 +77,6 @@ def _check_versions(self, conn_info: Dict[str, Any], logger.warning(msg) else: raise RuntimeError(msg) - if CURRENT_PROTOCOL_VERSION < conn_info["protocol_version"]: - msg = "Client Ray installation out of date:" + \ - f" client is {CURRENT_PROTOCOL_VERSION}," + \ - f" server is {conn_info['protocol_version']}" - if ignore_version: - logger.warning(msg) - else: - raise RuntimeError(msg) def disconnect(self): """Disconnect the Ray Client. 
diff --git a/python/ray/util/client/server/dataservicer.py b/python/ray/util/client/server/dataservicer.py index 82ddc85c6f5f..7091478208f3 100644 --- a/python/ray/util/client/server/dataservicer.py +++ b/python/ray/util/client/server/dataservicer.py @@ -8,13 +8,16 @@ import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc -from ray.util.client import CURRENT_PROTOCOL_VERSION if TYPE_CHECKING: from ray.util.client.server.server import RayletServicer logger = logging.getLogger(__name__) +# This version string is incremented to indicate breaking changes in the +# protocol that require upgrading the client version. +CURRENT_PROTOCOL_VERSION = "2020-02-01" + class DataServicer(ray_client_pb2_grpc.RayletDataStreamerServicer): def __init__(self, basic_service: "RayletServicer"): From 8f8a67991070cd569832c30b1b8a1e6031b41834 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 097/244] Revert "[Core] Ownership-based Object Directory - Changed infinite short-poll location subscription to long-poll. (#13841)" This reverts commit 93a2495f6282c4509f1d213555b9b67c335670ec. 
--- src/ray/core_worker/core_worker.cc | 26 +++---- src/ray/core_worker/reference_count.cc | 44 +---------- src/ray/core_worker/reference_count.h | 41 ++-------- .../ownership_based_object_directory.cc | 74 +++++-------------- src/ray/protobuf/core_worker.proto | 6 -- 5 files changed, 37 insertions(+), 154 deletions(-) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 6c8287c1507b..a8c2e85570a6 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -2219,25 +2219,19 @@ void CoreWorker::HandleGetObjectLocationsOwner( return; } auto object_id = ObjectID::FromBinary(request.object_id()); - const auto &callback = [object_id, reply, send_reply_callback]( - const absl::flat_hash_set &locations, - int64_t object_size, int64_t current_version) { - RAY_LOG(DEBUG) << "Replying to HandleGetObjectLocationsOwner for " << object_id - << " with location update version " << current_version << ", " - << locations.size() << " locations, and " << object_size - << " object size."; - for (const auto &node_id : locations) { + absl::optional> node_ids = + reference_counter_->GetObjectLocations(object_id); + Status status; + if (node_ids.has_value()) { + for (const auto &node_id : node_ids.value()) { reply->add_node_ids(node_id.Binary()); } - reply->set_object_size(object_size); - reply->set_current_version(current_version); - send_reply_callback(Status::OK(), nullptr, nullptr); - }; - auto status = reference_counter_->SubscribeObjectLocations( - object_id, request.last_version(), callback); - if (!status.ok()) { - send_reply_callback(status, nullptr, nullptr); + status = Status::OK(); + } else { + status = Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); } + reply->set_object_size(reference_counter_->GetObjectSize(object_id)); + send_reply_callback(status, nullptr, nullptr); } void CoreWorker::HandleWaitForRefRemoved(const rpc::WaitForRefRemovedRequest &request, diff --git 
a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index a38a98d801ed..ba2e20994e44 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -185,7 +185,6 @@ void ReferenceCounter::UpdateObjectSize(const ObjectID &object_id, int64_t objec auto it = object_id_refs_.find(object_id); if (it != object_id_refs_.end()) { it->second.object_size = object_size; - PushToLocationSubscribers(it); } } @@ -916,12 +915,11 @@ bool ReferenceCounter::AddObjectLocation(const ObjectID &object_id, absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it == object_id_refs_.end()) { - RAY_LOG(INFO) << "Tried to add an object location for an object " << object_id - << " that doesn't exist in the reference table"; + RAY_LOG(WARNING) << "Tried to add an object location for an object " << object_id + << " that doesn't exist in the reference table"; return false; } it->second.locations.insert(node_id); - PushToLocationSubscribers(it); return true; } @@ -930,12 +928,11 @@ bool ReferenceCounter::RemoveObjectLocation(const ObjectID &object_id, absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it == object_id_refs_.end()) { - RAY_LOG(INFO) << "Tried to remove an object location for an object " << object_id - << " that doesn't exist in the reference table"; + RAY_LOG(WARNING) << "Tried to remove an object location for an object " << object_id + << " that doesn't exist in the reference table"; return false; } it->second.locations.erase(node_id); - PushToLocationSubscribers(it); return true; } @@ -1006,39 +1003,6 @@ absl::optional ReferenceCounter::GetLocalityData( return locality_data; } -void ReferenceCounter::PushToLocationSubscribers(ReferenceTable::iterator it) { - const auto callbacks = it->second.location_subscription_callbacks; - it->second.location_subscription_callbacks.clear(); - it->second.location_version++; - for (const auto callback : callbacks) { - 
callback(it->second.locations, it->second.object_size, it->second.location_version); - } -} - -Status ReferenceCounter::SubscribeObjectLocations( - const ObjectID &object_id, int64_t last_location_version, - const LocationSubscriptionCallback &callback) { - absl::MutexLock lock(&mutex_); - auto it = object_id_refs_.find(object_id); - if (it == object_id_refs_.end()) { - RAY_LOG(INFO) << "Tried to register a location subscriber for an object " << object_id - << " that doesn't exist in the reference table." - << " The object has probably already been freed."; - return Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); - } - - if (last_location_version < it->second.location_version) { - // If the last location version is less than the current location version, we - // already have location data that the subscriber hasn't seen yet, so we immediately - // invoke the callback. - callback(it->second.locations, it->second.object_size, it->second.location_version); - } else { - // Otherwise, save the callback for later invocation. - it->second.location_subscription_callbacks.push_back(callback); - } - return Status::OK(); -} - ReferenceCounter::Reference ReferenceCounter::Reference::FromProto( const rpc::ObjectReferenceCount &ref_count) { Reference ref; diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index 014b94714715..9c0576393fb3 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -49,10 +49,6 @@ class ReferenceCounterInterface { virtual ~ReferenceCounterInterface() {} }; -// Callback for location subscriptions. -using LocationSubscriptionCallback = - std::function &, int64_t, int64_t)>; - /// Class used by the core worker to keep track of ObjectID reference counts for garbage /// collection. This class is thread safe. 
class ReferenceCounter : public ReferenceCounterInterface, @@ -401,19 +397,6 @@ class ReferenceCounter : public ReferenceCounterInterface, absl::optional> GetObjectLocations( const ObjectID &object_id) LOCKS_EXCLUDED(mutex_); - /// Subscribe to object location changes that are more recent than the given version. - /// The provided callback will be invoked when new locations become available. - /// - /// \param[in] object_id The object whose locations we want. - /// \param[in] last_location_version The version of the last location update the - /// caller received. Only more recent location updates will be returned. - /// \param[in] callback The callback to invoke with the location update. - /// \return The status of the location get. - Status SubscribeObjectLocations(const ObjectID &object_id, - int64_t last_location_version, - const LocationSubscriptionCallback &callback) - LOCKS_EXCLUDED(mutex_); - /// Get an object's size. This will return 0 if the object is out of scope. /// /// \param[in] object_id The object whose size to get. @@ -509,17 +492,13 @@ class ReferenceCounter : public ReferenceCounterInterface, /// process is a borrower, the borrower must add the owner's address before /// using the ObjectID. absl::optional owner_address; - /// If this object is owned by us and stored in plasma, and reference - /// counting is enabled, then some raylet must be pinning the object value. - /// This is the address of that raylet. + // If this object is owned by us and stored in plasma, and reference + // counting is enabled, then some raylet must be pinning the object value. + // This is the address of that raylet. absl::optional pinned_at_raylet_id; - /// If this object is owned by us and stored in plasma, this contains all - /// object locations. + // If this object is owned by us and stored in plasma, this contains all + // object locations. 
absl::flat_hash_set locations; - /// A logical counter for object location updates, used for object location - /// subscriptions. Subscribers use -1 to indicate that they want us to - /// immediately send them the current location data. - int64_t location_version = 0; // Whether this object can be reconstructed via lineage. If false, then the // object's value will be pinned as long as it is referenced by any other // object's lineage. @@ -586,9 +565,7 @@ class ReferenceCounter : public ReferenceCounterInterface, size_t lineage_ref_count = 0; /// Whether this object has been spilled to external storage. bool spilled = false; - /// Location subscription callbacks registered by async location get requests. - /// These will be invoked whenever locations or object_size are changed. - std::vector location_subscription_callbacks; + /// Callback that will be called when this ObjectID no longer has /// references. std::function on_delete; @@ -712,12 +689,6 @@ class ReferenceCounter : public ReferenceCounterInterface, void ReleaseLineageReferencesInternal(const std::vector &argument_ids) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - /// Pushes location updates to subscribers of a particular reference, invoking all - /// callbacks registered for the reference by GetLocationsAsync calls. This method - /// also increments the reference's location version counter. - void PushToLocationSubscribers(ReferenceTable::iterator it) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - /// Address of our RPC server. This is used to determine whether we own a /// given object or not, by comparing our WorkerID with the WorkerID of the /// object's owner. 
diff --git a/src/ray/object_manager/ownership_based_object_directory.cc b/src/ray/object_manager/ownership_based_object_directory.cc index 3f2ccc540ed2..a17d3dfc66c0 100644 --- a/src/ray/object_manager/ownership_based_object_directory.cc +++ b/src/ray/object_manager/ownership_based_object_directory.cc @@ -80,18 +80,11 @@ ray::Status OwnershipBasedObjectDirectory::ReportObjectAdded( request.set_node_id(node_id.Binary()); rpc_client->AddObjectLocationOwner( - request, [worker_id, object_id, node_id]( - Status status, const rpc::AddObjectLocationOwnerReply &reply) { + request, [worker_id, object_id](Status status, + const rpc::AddObjectLocationOwnerReply &reply) { if (!status.ok()) { - if (status.IsObjectNotFound()) { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to add the location " - << node_id << " for " << object_id - << " because the owner no longer has the object; we assume the " - "object was evicted."; - } else { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to add the location " - << node_id << " for " << object_id << ": " << status.ToString(); - } + RAY_LOG(ERROR) << "Worker " << worker_id << " failed to add the location for " + << object_id; } }); return Status::OK(); @@ -115,18 +108,11 @@ ray::Status OwnershipBasedObjectDirectory::ReportObjectRemoved( request.set_node_id(node_id.Binary()); rpc_client->RemoveObjectLocationOwner( - request, [worker_id, object_id, node_id]( - Status status, const rpc::RemoveObjectLocationOwnerReply &reply) { + request, [worker_id, object_id](Status status, + const rpc::RemoveObjectLocationOwnerReply &reply) { if (!status.ok()) { - if (status.IsObjectNotFound()) { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to remove the location " - << node_id << " for " << object_id - << " because the owner no longer has the object; we assume the " - "object was freed."; - } else { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to remove the location " - << node_id << " for " << object_id << ": " << 
status.ToString(); - } + RAY_LOG(ERROR) << "Worker " << worker_id + << " failed to remove the location for " << object_id; } }); return Status::OK(); @@ -135,36 +121,22 @@ ray::Status OwnershipBasedObjectDirectory::ReportObjectRemoved( void OwnershipBasedObjectDirectory::SubscriptionCallback( ObjectID object_id, WorkerID worker_id, Status status, const rpc::GetObjectLocationsOwnerReply &reply) { - // Objects are added to this map in SubscribeObjectLocations. auto it = listeners_.find(object_id); - // Do nothing for objects we are not listening for. if (it == listeners_.end()) { return; } - std::unordered_set node_ids; - - // Once this flag is set to true, it should never go back to false. - it->second.subscribed = true; - if (!status.ok()) { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to return location updates to " - << "subscribers for " << object_id << ": " << status.ToString() - << ", assuming that the object was freed or evicted."; - it->second.object_size = 0; - } else { - if (reply.object_size() > 0) { - it->second.object_size = reply.object_size(); - } + if (reply.object_size() > 0) { + it->second.object_size = reply.object_size(); + } - for (auto const &node_id : reply.node_ids()) { - node_ids.emplace(NodeID::FromBinary(node_id)); - } - FilterRemovedNodes(gcs_client_, &node_ids); + std::unordered_set node_ids; + for (auto const &node_id : reply.node_ids()) { + node_ids.emplace(NodeID::FromBinary(node_id)); } - if (node_ids != it->second.current_object_locations || !status.ok()) { + FilterRemovedNodes(gcs_client_, &node_ids); + if (node_ids != it->second.current_object_locations) { it->second.current_object_locations = std::move(node_ids); - // Copy the callbacks so that the callbacks can unsubscribe without interrupting - // looping over the callbacks. auto callbacks = it->second.callbacks; // Call all callbacks associated with the object id locations we have // received. 
This notifies the client even if the list of locations is @@ -182,7 +154,7 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( rpc::GetObjectLocationsOwnerRequest request; request.set_intended_worker_id(worker_id.Binary()); request.set_object_id(object_id.Binary()); - request.set_last_version(reply.current_version()); + // TODO(zhuohan): Fix this infinite loop. worker_it->second->GetObjectLocationsOwner( request, std::bind(&OwnershipBasedObjectDirectory::SubscriptionCallback, this, object_id, @@ -204,7 +176,6 @@ ray::Status OwnershipBasedObjectDirectory::SubscribeObjectLocations( rpc::GetObjectLocationsOwnerRequest request; request.set_intended_worker_id(owner_address.worker_id()); request.set_object_id(object_id.Binary()); - request.set_last_version(-1); rpc_client->GetObjectLocationsOwner( request, std::bind(&OwnershipBasedObjectDirectory::SubscriptionCallback, this, object_id, @@ -217,16 +188,6 @@ ray::Status OwnershipBasedObjectDirectory::SubscribeObjectLocations( return Status::OK(); } listener_state.callbacks.emplace(callback_id, callback); - - // If we previously received some notifications about the object's locations, - // immediately notify the caller of the current known locations. 
- if (listener_state.subscribed) { - auto &locations = listener_state.current_object_locations; - auto object_size = it->second.object_size; - io_service_.post([callback, locations, object_size, object_id]() { - callback(object_id, locations, "", NodeID::Nil(), object_size); - }); - } return Status::OK(); } @@ -260,7 +221,6 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( rpc::GetObjectLocationsOwnerRequest request; request.set_intended_worker_id(owner_address.worker_id()); request.set_object_id(object_id.Binary()); - request.set_last_version(-1); rpc_client->GetObjectLocationsOwner( request, [this, worker_id, object_id, callback]( diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index ef5f9730212f..43a3a667407b 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -182,17 +182,11 @@ message RemoveObjectLocationOwnerReply { message GetObjectLocationsOwnerRequest { bytes intended_worker_id = 1; bytes object_id = 2; - // The version of the last location update. Only updates more recent than this version - // will be returned. -1 indicates that the current location data should - // always be returned. - int64 last_version = 3; } message GetObjectLocationsOwnerReply { repeated bytes node_ids = 1; uint64 object_size = 2; - // The version of the returned location updates. - int64 current_version = 3; } message KillActorRequest { From 4fdcacd89ed145a2e849a54cfbe0c7fe1aa8999f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 098/244] Revert "[Object Spilling] Add consumed bytes to detect thrashing. (#13853)" This reverts commit b212885a3f3456202661429340c3518ae30ab316. 
--- python/ray/internal/internal_api.py | 9 +- python/ray/scripts/scripts.py | 11 +-- python/ray/tests/test_memstat.py | 3 +- python/ray/tests/test_object_spilling.py | 86 +++++-------------- src/ray/core_worker/core_worker.cc | 13 +-- src/ray/core_worker/core_worker.h | 2 - .../store_provider/plasma_store_provider.cc | 7 +- src/ray/object_manager/object_buffer_pool.cc | 3 +- src/ray/object_manager/object_manager.cc | 3 - src/ray/object_manager/plasma/client.cc | 32 +++---- src/ray/object_manager/plasma/client.h | 6 +- src/ray/object_manager/plasma/plasma.fbs | 2 - src/ray/object_manager/plasma/protocol.cc | 9 +- src/ray/object_manager/plasma/protocol.h | 6 +- src/ray/object_manager/plasma/store.cc | 26 ++---- src/ray/object_manager/plasma/store.h | 9 +- src/ray/object_manager/plasma/store_runner.cc | 2 - src/ray/object_manager/plasma/store_runner.h | 2 - src/ray/protobuf/node_manager.proto | 2 - src/ray/raylet/node_manager.cc | 6 +- 20 files changed, 67 insertions(+), 172 deletions(-) diff --git a/python/ray/internal/internal_api.py b/python/ray/internal/internal_api.py index 7956725b7b05..67c1a9275f37 100644 --- a/python/ray/internal/internal_api.py +++ b/python/ray/internal/internal_api.py @@ -13,9 +13,7 @@ def global_gc(): worker.core_worker.global_gc() -def memory_summary(node_manager_address=None, - node_manager_port=None, - stats_only=False): +def memory_summary(node_manager_address=None, node_manager_port=None): """Returns a formatted string describing memory usage in the cluster.""" import grpc @@ -65,11 +63,6 @@ def memory_summary(node_manager_address=None, reply.store_stats.restored_objects_total, int(reply.store_stats.restored_bytes_total / (1024 * 1024) / reply.store_stats.restore_time_total_s))) - if reply.store_stats.consumed_bytes > 0: - store_summary += ("Objects consumed by Ray tasks: {} MiB.".format( - int(reply.store_stats.consumed_bytes / (1024 * 1024)))) - if stats_only: - return store_summary return reply.memory_summary + "\n" + store_summary 
diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 8deaa6f4a2f0..d4ae094d95e3 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -1372,13 +1372,7 @@ def timeline(address): type=str, default=ray_constants.REDIS_DEFAULT_PASSWORD, help="Connect to ray with redis_password.") -@click.option( - "--stats-only", - is_flag=True, - type=bool, - default=False, - help="Connect to ray with redis_password.") -def memory(address, redis_password, stats_only): +def memory(address, redis_password): """Print object references held in a Ray cluster.""" if not address: address = services.get_ray_address_to_use_or_die() @@ -1387,8 +1381,7 @@ def memory(address, redis_password, stats_only): raylet = state.node_table()[0] print( ray.internal.internal_api.memory_summary(raylet["NodeManagerAddress"], - raylet["NodeManagerPort"], - stats_only)) + raylet["NodeManagerPort"])) @cli.command() diff --git a/python/ray/tests/test_memstat.py b/python/ray/tests/test_memstat.py index a0e8e3c90ed1..cb734b3b7582 100644 --- a/python/ray/tests/test_memstat.py +++ b/python/ray/tests/test_memstat.py @@ -27,8 +27,7 @@ def data_lines(memory_str): for line in memory_str.split("\n"): if (not line or "---" in line or "===" in line or "Object ID" in line - or "pid=" in line or "Plasma memory" in line - or "Objects consumed" in line): + or "pid=" in line or "Plasma memory" in line): continue yield line diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 500c662250ac..159e0aaf79b1 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -88,27 +88,6 @@ def is_dir_empty(temp_folder, return num_files == 0 -def assert_no_thrashing(address): - state = ray.state.GlobalState() - state._initialize_global_state(address, - ray.ray_constants.REDIS_DEFAULT_PASSWORD) - raylet = state.node_table()[0] - memory_summary = ray.internal.internal_api.memory_summary( - 
raylet["NodeManagerAddress"], - raylet["NodeManagerPort"], - stats_only=True) - restored_bytes = 0 - consumed_bytes = 0 - - for line in memory_summary.split("\n"): - if "Restored" in line: - restored_bytes = int(line.split(" ")[1]) - if "consumed" in line: - consumed_bytes = int(line.split(" ")[-2]) - assert consumed_bytes >= restored_bytes, ( - f"consumed: {consumed_bytes}, restored: {restored_bytes}") - - def test_invalid_config_raises_exception(shutdown_only): # Make sure ray.init raises an exception before # it starts processes when invalid object spilling @@ -208,7 +187,7 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = object_spilling_config - address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, @@ -224,7 +203,6 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config, ref2 = ray.put(arr) # noqa wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( @@ -271,7 +249,6 @@ def depends(arg): # Test passing the spilled object as an arg to another task. ray.get(depends.remote(ref)) - assert_no_thrashing(cluster.address) @pytest.mark.skipif( @@ -279,7 +256,7 @@ def depends(arg): def test_spill_objects_automatically(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. 
object_spilling_config, _ = object_spilling_config - address = ray.init( + ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, _system_config={ @@ -310,15 +287,14 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): solution = solution_buffer[index] sample = ray.get(ref, timeout=0) assert np.array_equal(sample, solution) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") + platform.system() in ["Darwin", "Windows"], reason="Failing on Windows.") def test_spill_stats(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, _ = object_spilling_config - address = ray.init( + ray.init( num_cpus=1, object_store_memory=100 * 1024 * 1024, _system_config={ @@ -343,31 +319,17 @@ def f(): x_id = f.remote() # noqa ray.get(x_id) - s = memory_summary(stats_only=True) + s = memory_summary() assert "Plasma memory usage 50 MiB, 1 objects, 50.0% full" in s, s assert "Spilled 200 MiB, 4 objects" in s, s assert "Restored 150 MiB, 3 objects" in s, s - # Test if consumed bytes are correctly calculated. - obj = ray.put(np.zeros(30 * 1024 * 1024, dtype=np.uint8)) - - @ray.remote - def func_with_ref(obj): - return True - - ray.get(func_with_ref.remote(obj)) - - s = memory_summary(stats_only=True) - # 50MB * 5 references + 30MB used for task execution. - assert "Objects consumed by Ray tasks: 280 MiB." in s, s - assert_no_thrashing(address["redis_address"]) - @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_during_get(object_spilling_config, shutdown_only): object_spilling_config, _ = object_spilling_config - address = ray.init( + ray.init( num_cpus=4, object_store_memory=100 * 1024 * 1024, _system_config={ @@ -393,7 +355,6 @@ def f(): # objects are being created. 
for x in ids: print(ray.get(x).shape) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( @@ -401,7 +362,7 @@ def f(): def test_spill_deadlock(object_spilling_config, shutdown_only): object_spilling_config, _ = object_spilling_config # Limit our object store to 75 MiB of memory. - address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 1, @@ -425,7 +386,6 @@ def test_spill_deadlock(object_spilling_config, shutdown_only): ref = random.choice(replay_buffer) sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( @@ -434,7 +394,7 @@ def test_delete_objects(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = object_spilling_config - address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 1, @@ -457,7 +417,6 @@ def test_delete_objects(object_spilling_config, shutdown_only): del replay_buffer del ref wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( @@ -467,7 +426,7 @@ def test_delete_objects_delete_while_creating(object_spilling_config, # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = object_spilling_config - address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, @@ -498,7 +457,6 @@ def test_delete_objects_delete_while_creating(object_spilling_config, del replay_buffer del ref wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( @@ -508,7 +466,7 @@ def test_delete_objects_on_worker_failure(object_spilling_config, # Limit our object store to 75 MiB of memory. 
object_spilling_config, temp_folder = object_spilling_config - address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, @@ -560,7 +518,6 @@ def wait_until_actor_dead(): # After all, make sure all objects are deleted upon worker failures. wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( @@ -582,11 +539,10 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config, "object_store_full_delay_ms": 100, "object_spilling_config": object_spilling_config, }) - ray.init(address=cluster.address) # Add 2 worker nodes. for _ in range(2): cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024) - cluster.wait_for_nodes() + ray.init(address=cluster.address) arr = np.random.rand(1024 * 1024) # 8 MB data @@ -609,9 +565,9 @@ def create_objects(self): self.replay_buffer.pop() # Do random sampling. - for _ in range(50): + for _ in range(200): ref = random.choice(self.replay_buffer) - sample = ray.get(ref, timeout=10) + sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) actors = [Actor.remote() for _ in range(3)] @@ -630,7 +586,6 @@ def wait_until_actor_dead(actor): wait_for_condition(lambda: wait_until_actor_dead(actor)) # The multi node deletion should work. wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(cluster.address) @pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") @@ -638,7 +593,7 @@ def test_fusion_objects(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. 
object_spilling_config, temp_folder = object_spilling_config min_spilling_size = 10 * 1024 * 1024 - address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 3, @@ -682,13 +637,12 @@ def test_fusion_objects(object_spilling_config, shutdown_only): if file_size >= min_spilling_size: is_test_passing = True assert is_test_passing - assert_no_thrashing(address["redis_address"]) # https://github.com/ray-project/ray/issues/12912 def do_test_release_resource(object_spilling_config, expect_released): object_spilling_config, temp_folder = object_spilling_config - address = ray.init( + ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, _system_config={ @@ -720,7 +674,6 @@ def f(dep): assert ready else: assert not ready - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( @@ -792,7 +745,6 @@ def allocate(*args): # spilling. tasks = [foo.remote(*task_args) for task_args in args] ray.get(tasks) - assert_no_thrashing(cluster.address) @pytest.mark.skipif( @@ -849,6 +801,14 @@ def test_file_deleted_when_driver_exits(tmp_path, shutdown_only): driver.format(temp_dir=str(temp_folder), signum=2))) wait_for_condition(lambda: is_dir_empty(temp_folder, append_path="")) + # Q: Looks like Sigterm doesn't work with Ray? + # print("Sending sigterm...") + # # Run a driver with sigterm. + # with pytest.raises(subprocess.CalledProcessError): + # print(run_string_as_driver( + # driver.format(temp_dir=str(temp_folder), signum=15))) + # wait_for_condition(is_dir_empty, timeout=1000) + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index a8c2e85570a6..b56f18cf04e4 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -566,8 +566,6 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ // NOTE: This also marks the worker as available in Raylet. 
We do this at the // very end in case there is a problem during construction. RAY_CHECK_OK(local_raylet_client_->AnnounceWorkerPort(core_worker_server_->GetPort())); - // Used to detect if the object is in the plasma store. - max_direct_call_object_size_ = RayConfig::instance().max_direct_call_object_size(); } void CoreWorker::Shutdown() { @@ -883,7 +881,8 @@ Status CoreWorker::Put(const RayObject &object, bool object_exists; if (options_.is_local_mode || (RayConfig::instance().put_small_object_in_memory_store() && - static_cast(object.GetSize()) < max_direct_call_object_size_)) { + static_cast(object.GetSize()) < + RayConfig::instance().max_direct_call_object_size())) { RAY_LOG(DEBUG) << "Put " << object_id << " in memory store"; RAY_CHECK(memory_store_->Put(object, object_id)); return Status::OK(); @@ -924,7 +923,8 @@ Status CoreWorker::CreateOwned(const std::shared_ptr &metadata, NodeID::FromBinary(rpc_address_.raylet_id())); if (options_.is_local_mode || (RayConfig::instance().put_small_object_in_memory_store() && - static_cast(data_size) < max_direct_call_object_size_)) { + static_cast(data_size) < + RayConfig::instance().max_direct_call_object_size())) { *data = std::make_shared(data_size); } else { auto status = @@ -1037,7 +1037,7 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_m bool missing_result = false; bool will_throw_exception = false; for (size_t i = 0; i < ids.size(); i++) { - const auto pair = result_map.find(ids[i]); + auto pair = result_map.find(ids[i]); if (pair != result_map.end()) { (*results)[i] = pair->second; RAY_CHECK(!pair->second->IsInPlasmaError()); @@ -1778,7 +1778,8 @@ Status CoreWorker::AllocateReturnObjects( // Allocate a buffer for the return object. 
if (options_.is_local_mode || - static_cast(data_sizes[i]) < max_direct_call_object_size_) { + static_cast(data_sizes[i]) < + RayConfig::instance().max_direct_call_object_size()) { data_buffer = std::make_shared(data_sizes[i]); } else { RAY_RETURN_NOT_OK(CreateExisting(metadatas[i], data_sizes[i], object_ids[i], diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 6fa24c29e94e..89331b5ce10f 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -1255,8 +1255,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Whether we are shutting down and not running further tasks. bool exiting_ = false; - int64_t max_direct_call_object_size_; - friend class CoreWorkerTest; }; diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index f3b5f047c8fc..b42c4b50941f 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -191,8 +191,7 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( std::vector plasma_results; { std::lock_guard guard(store_client_mutex_); - RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, timeout_ms, &plasma_results, - /*is_from_worker=*/true)); + RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, timeout_ms, &plasma_results)); } // Add successfully retrieved objects to the result map and remove them from @@ -232,9 +231,7 @@ Status CoreWorkerPlasmaStoreProvider::GetIfLocal( std::vector plasma_results; { std::lock_guard guard(store_client_mutex_); - // Since this path is used only for spilling, we should set is_from_worker: false. 
- RAY_RETURN_NOT_OK(store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results, - /*is_from_worker=*/false)); + RAY_RETURN_NOT_OK(store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results)); } for (size_t i = 0; i < object_ids.size(); i++) { diff --git a/src/ray/object_manager/object_buffer_pool.cc b/src/ray/object_manager/object_buffer_pool.cc index 63dabcb419ef..726a6fefca35 100644 --- a/src/ray/object_manager/object_buffer_pool.cc +++ b/src/ray/object_manager/object_buffer_pool.cc @@ -57,8 +57,7 @@ std::pair ObjectBufferPool::Ge std::lock_guard lock(pool_mutex_); if (get_buffer_state_.count(object_id) == 0) { plasma::ObjectBuffer object_buffer; - RAY_CHECK_OK( - store_client_.Get(&object_id, 1, 0, &object_buffer, /*is_from_worker=*/false)); + RAY_CHECK_OK(store_client_.Get(&object_id, 1, 0, &object_buffer)); if (object_buffer.data == nullptr) { RAY_LOG(INFO) << "Failed to get a chunk of the object: " << object_id diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index d59737ca6c25..448245e012ee 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -834,9 +834,6 @@ void ObjectManager::FillObjectStoreStats(rpc::GetNodeStatsReply *reply) const { stats->set_object_store_bytes_used(used_memory_); stats->set_object_store_bytes_avail(config_.object_store_memory); stats->set_num_local_objects(local_objects_.size()); - if (plasma::plasma_store_runner) { - stats->set_consumed_bytes(plasma::plasma_store_runner->GetConsumedBytes()); - } } void ObjectManager::Tick(const boost::system::error_code &e) { diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 9b9bb5408df4..a5429d985f91 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -121,10 +121,10 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data, int device_num); Status Get(const std::vector 
&object_ids, int64_t timeout_ms, - std::vector *object_buffers, bool is_from_worker); + std::vector *object_buffers); Status Get(const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - ObjectBuffer *object_buffers, bool is_from_worker); + ObjectBuffer *object_buffers); Status Release(const ObjectID &object_id); @@ -172,7 +172,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this( const ObjectID &, const std::shared_ptr &)> &wrap_buffer, - ObjectBuffer *object_buffers, bool is_from_worker); + ObjectBuffer *object_buffers); uint8_t *LookupMmappedFile(MEMFD_TYPE store_fd_val); @@ -362,7 +362,7 @@ Status PlasmaClient::Impl::GetBuffers( const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, const std::function( const ObjectID &, const std::shared_ptr &)> &wrap_buffer, - ObjectBuffer *object_buffers, bool is_from_worker) { + ObjectBuffer *object_buffers) { // Fill out the info for the objects that are already in use locally. bool all_present = true; for (int64_t i = 0; i < num_objects; ++i) { @@ -409,8 +409,7 @@ Status PlasmaClient::Impl::GetBuffers( // If we get here, then the objects aren't all currently in use by this // client, so we need to send a request to the plasma store. 
- RAY_RETURN_NOT_OK(SendGetRequest(store_conn_, &object_ids[0], num_objects, timeout_ms, - is_from_worker)); + RAY_RETURN_NOT_OK(SendGetRequest(store_conn_, &object_ids[0], num_objects, timeout_ms)); std::vector buffer; RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); @@ -471,8 +470,7 @@ Status PlasmaClient::Impl::GetBuffers( } Status PlasmaClient::Impl::Get(const std::vector &object_ids, - int64_t timeout_ms, std::vector *out, - bool is_from_worker) { + int64_t timeout_ms, std::vector *out) { std::lock_guard guard(client_mutex_); const auto wrap_buffer = [=](const ObjectID &object_id, @@ -481,19 +479,16 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, }; const size_t num_objects = object_ids.size(); *out = std::vector(num_objects); - return GetBuffers(&object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], - is_from_worker); + return GetBuffers(&object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0]); } Status PlasmaClient::Impl::Get(const ObjectID *object_ids, int64_t num_objects, - int64_t timeout_ms, ObjectBuffer *out, - bool is_from_worker) { + int64_t timeout_ms, ObjectBuffer *out) { std::lock_guard guard(client_mutex_); const auto wrap_buffer = [](const ObjectID &object_id, const std::shared_ptr &buffer) { return buffer; }; - return GetBuffers(object_ids, num_objects, timeout_ms, wrap_buffer, out, - is_from_worker); + return GetBuffers(object_ids, num_objects, timeout_ms, wrap_buffer, out); } Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID &object_id) { @@ -758,14 +753,13 @@ Status PlasmaClient::TryCreateImmediately(const ObjectID &object_id, } Status PlasmaClient::Get(const std::vector &object_ids, int64_t timeout_ms, - std::vector *object_buffers, bool is_from_worker) { - return impl_->Get(object_ids, timeout_ms, object_buffers, is_from_worker); + std::vector *object_buffers) { + return impl_->Get(object_ids, timeout_ms, 
object_buffers); } Status PlasmaClient::Get(const ObjectID *object_ids, int64_t num_objects, - int64_t timeout_ms, ObjectBuffer *object_buffers, - bool is_from_worker) { - return impl_->Get(object_ids, num_objects, timeout_ms, object_buffers, is_from_worker); + int64_t timeout_ms, ObjectBuffer *object_buffers) { + return impl_->Get(object_ids, num_objects, timeout_ms, object_buffers); } Status PlasmaClient::Release(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index 703250bd23b0..e88a9eb138a1 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -161,10 +161,9 @@ class PlasmaClient { /// \param timeout_ms The amount of time in milliseconds to wait before this /// request times out. If this value is -1, then no timeout is set. /// \param[out] object_buffers The object results. - /// \param is_from_worker Whether or not if the Get request comes from a Ray workers. /// \return The return status. Status Get(const std::vector &object_ids, int64_t timeout_ms, - std::vector *object_buffers, bool is_from_worker); + std::vector *object_buffers); /// Deprecated variant of Get() that doesn't automatically release buffers /// when they get out of scope. @@ -174,13 +173,12 @@ class PlasmaClient { /// \param timeout_ms The amount of time in milliseconds to wait before this /// request times out. If this value is -1, then no timeout is set. /// \param object_buffers An array where the results will be stored. - /// \param is_from_worker Whether or not if the Get request comes from a Ray workers. /// \return The return status. /// /// The caller is responsible for releasing any retrieved objects, but it /// should not release objects that were not retrieved. 
Status Get(const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - ObjectBuffer *object_buffers, bool is_from_worker); + ObjectBuffer *object_buffers); /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index 5a268a891d4a..3816de79e842 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -210,8 +210,6 @@ table PlasmaGetRequest { object_ids: [string]; // The number of milliseconds before the request should timeout. timeout_ms: long; - // Whether or not the get request is from the core worker. It is used to record how many bytes are consumed by core workers. - is_from_worker: bool; } table PlasmaGetReply { diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index c3b5b55ee1d5..8c3164d6a7df 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -553,16 +553,16 @@ Status ReadEvictReply(uint8_t *data, size_t size, int64_t &num_bytes) { // Get messages. 
Status SendGetRequest(const std::shared_ptr &store_conn, - const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - bool is_from_worker) { + const ObjectID *object_ids, int64_t num_objects, + int64_t timeout_ms) { flatbuffers::FlatBufferBuilder fbb; auto message = fb::CreatePlasmaGetRequest( - fbb, ToFlatbuffer(&fbb, object_ids, num_objects), timeout_ms, is_from_worker); + fbb, ToFlatbuffer(&fbb, object_ids, num_objects), timeout_ms); return PlasmaSend(store_conn, MessageType::PlasmaGetRequest, &fbb, message); } Status ReadGetRequest(uint8_t *data, size_t size, std::vector &object_ids, - int64_t *timeout_ms, bool *is_from_worker) { + int64_t *timeout_ms) { RAY_DCHECK(data); auto message = flatbuffers::GetRoot(data); RAY_DCHECK(VerifyFlatbuffer(message, data, size)); @@ -571,7 +571,6 @@ Status ReadGetRequest(uint8_t *data, size_t size, std::vector &object_ object_ids.push_back(ObjectID::FromBinary(object_id)); } *timeout_ms = message->timeout_ms(); - *is_from_worker = message->is_from_worker(); return Status::OK(); } diff --git a/src/ray/object_manager/plasma/protocol.h b/src/ray/object_manager/plasma/protocol.h index f5baf03ec955..a8ba71b4621f 100644 --- a/src/ray/object_manager/plasma/protocol.h +++ b/src/ray/object_manager/plasma/protocol.h @@ -128,11 +128,11 @@ Status ReadSealReply(uint8_t *data, size_t size, ObjectID *object_id); /* Plasma Get message functions. 
*/ Status SendGetRequest(const std::shared_ptr &store_conn, - const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - bool is_from_worker); + const ObjectID *object_ids, int64_t num_objects, + int64_t timeout_ms); Status ReadGetRequest(uint8_t *data, size_t size, std::vector &object_ids, - int64_t *timeout_ms, bool *is_from_worker); + int64_t *timeout_ms); Status SendGetReply(const std::shared_ptr &client, ObjectID object_ids[], std::unordered_map &plasma_objects, diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index af72192732ec..e101c5a9b71a 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -69,7 +69,7 @@ namespace plasma { struct GetRequest { GetRequest(boost::asio::io_service &io_context, const std::shared_ptr &client, - const std::vector &object_ids, bool is_from_worker); + const std::vector &object_ids); /// The client that called get. std::shared_ptr client; /// The object IDs involved in this request. This is used in the reply. @@ -82,9 +82,6 @@ struct GetRequest { /// The number of object requests in this wait request that are already /// satisfied. int64_t num_satisfied; - /// Whether or not the request comes from the core worker. It is used to track the size - /// of total objects that are consumed by core worker. 
- bool is_from_worker; void AsyncWait(int64_t timeout_ms, std::function on_timeout) { @@ -103,12 +100,11 @@ struct GetRequest { GetRequest::GetRequest(boost::asio::io_service &io_context, const std::shared_ptr &client, - const std::vector &object_ids, bool is_from_worker) + const std::vector &object_ids) : client(client), object_ids(object_ids.begin(), object_ids.end()), objects(object_ids.size()), num_satisfied(0), - is_from_worker(is_from_worker), timer_(io_context) { std::unordered_set unique_ids(object_ids.begin(), object_ids.end()); num_objects_to_wait_for = unique_ids.size(); @@ -397,9 +393,6 @@ void PlasmaStore::ReturnFromGet(GetRequest *get_req) { fds_to_send.insert(fd); store_fds.push_back(fd); mmap_sizes.push_back(GetMmapSize(fd)); - if (get_req->is_from_worker) { - total_consumed_bytes_ += object.data_size + object.metadata_size; - } } } // Send the get reply to the client. @@ -472,9 +465,9 @@ void PlasmaStore::UpdateObjectGetRequests(const ObjectID &object_id) { void PlasmaStore::ProcessGetRequest(const std::shared_ptr &client, const std::vector &object_ids, - int64_t timeout_ms, bool is_from_worker) { + int64_t timeout_ms) { // Create a get request for this object. - auto get_req = new GetRequest(io_context_, client, object_ids, is_from_worker); + auto get_req = new GetRequest(io_context_, client, object_ids); for (auto object_id : object_ids) { // Check if this object is already present // locally. If so, record that the object is being used and mark it as accounted for. 
@@ -901,10 +894,8 @@ Status PlasmaStore::ProcessMessage(const std::shared_ptr &client, case fb::MessageType::PlasmaGetRequest: { std::vector object_ids_to_get; int64_t timeout_ms; - bool is_from_worker; - RAY_RETURN_NOT_OK(ReadGetRequest(input, input_size, object_ids_to_get, &timeout_ms, - &is_from_worker)); - ProcessGetRequest(client, object_ids_to_get, timeout_ms, is_from_worker); + RAY_RETURN_NOT_OK(ReadGetRequest(input, input_size, object_ids_to_get, &timeout_ms)); + ProcessGetRequest(client, object_ids_to_get, timeout_ms); } break; case fb::MessageType::PlasmaReleaseRequest: { RAY_RETURN_NOT_OK(ReadReleaseRequest(input, input_size, &object_id)); @@ -1029,11 +1020,6 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, } } -int64_t PlasmaStore::GetConsumedBytes() { - std::lock_guard guard(mutex_); - return total_consumed_bytes_; -} - bool PlasmaStore::IsObjectSpillable(const ObjectID &object_id) { // The lock is acquired when a request is received to the plasma store. // recursive mutex is used here to allow diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index eedcb526d809..214cf9763bf6 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -139,8 +139,7 @@ class PlasmaStore { /// \param object_ids Object IDs of the objects to be gotten. /// \param timeout_ms The timeout for the get request in milliseconds. void ProcessGetRequest(const std::shared_ptr &client, - const std::vector &object_ids, int64_t timeout_ms, - bool is_from_worker); + const std::vector &object_ids, int64_t timeout_ms); /// Seal a vector of objects. The objects are now immutable and can be accessed with /// get. @@ -191,9 +190,6 @@ class PlasmaStore { /// before the object is pinned by raylet for the first time. bool IsObjectSpillable(const ObjectID &object_id); - /// Return the plasma object bytes that are consumed by core workers. 
- int64_t GetConsumedBytes(); - void SetNotificationListener( const std::shared_ptr ¬ification_listener) { notification_listener_ = notification_listener; @@ -320,9 +316,6 @@ class PlasmaStore { std::recursive_mutex mutex_; size_t num_bytes_in_use_ = 0; - - /// Total plasma object bytes that are consumed by core workers. - int64_t total_consumed_bytes_ = 0; }; } // namespace plasma diff --git a/src/ray/object_manager/plasma/store_runner.cc b/src/ray/object_manager/plasma/store_runner.cc index 5a44e297cd42..34e08080cced 100644 --- a/src/ray/object_manager/plasma/store_runner.cc +++ b/src/ray/object_manager/plasma/store_runner.cc @@ -123,8 +123,6 @@ bool PlasmaStoreRunner::IsPlasmaObjectSpillable(const ObjectID &object_id) { return store_->IsObjectSpillable(object_id); } -int64_t PlasmaStoreRunner::GetConsumedBytes() { return store_->GetConsumedBytes(); } - std::unique_ptr plasma_store_runner; } // namespace plasma diff --git a/src/ray/object_manager/plasma/store_runner.h b/src/ray/object_manager/plasma/store_runner.h index f4785810cb24..7ac7be59bbc5 100644 --- a/src/ray/object_manager/plasma/store_runner.h +++ b/src/ray/object_manager/plasma/store_runner.h @@ -22,8 +22,6 @@ class PlasmaStoreRunner { } bool IsPlasmaObjectSpillable(const ObjectID &object_id); - int64_t GetConsumedBytes(); - void GetAvailableMemoryAsync(std::function callback) const { main_service_.post([this, callback]() { store_->GetAvailableMemory(callback); }); } diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index 8e225293c54f..386ed988ade3 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -138,8 +138,6 @@ message ObjectStoreStats { int64 object_store_bytes_avail = 8; // The number of local objects total. int64 num_local_objects = 9; - // The number of plasma object bytes that are consumed by core workers. 
- int64 consumed_bytes = 10; } message GetNodeStatsReply { diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 2c20bab40a39..e784758b1c92 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -2384,9 +2384,7 @@ bool NodeManager::GetObjectsFromPlasma(const std::vector &object_ids, // heavy load, then this request can still block the NodeManager event loop // since we must wait for the plasma store's reply. We should consider using // an `AsyncGet` instead. - if (!store_client_ - .Get(object_ids, /*timeout_ms=*/0, &plasma_results, /*is_from_worker=*/false) - .ok()) { + if (!store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results).ok()) { return false; } @@ -2548,8 +2546,6 @@ rpc::ObjectStoreStats AccumulateStoreStats( cur_store.object_store_bytes_avail()); store_stats.set_num_local_objects(store_stats.num_local_objects() + cur_store.num_local_objects()); - store_stats.set_consumed_bytes(store_stats.consumed_bytes() + - cur_store.consumed_bytes()); } return store_stats; } From cf6fcaba0b28dc6432b341b74b1277e830641d90 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 099/244] Revert "[ray_client] convert things registered for ray into ray_client (#13639)" This reverts commit f523284ca111063b1889e4131222c89717c9b379. 
--- python/ray/_private/client_mode_hook.py | 51 +++---------------------- python/ray/actor.py | 25 ------------ python/ray/remote_function.py | 22 ----------- python/ray/tests/test_client_init.py | 49 ------------------------ python/ray/util/client/api.py | 14 ------- python/ray/util/client/common.py | 18 ++------- python/ray/util/client/options.py | 9 ++--- python/ray/util/client/worker.py | 50 ------------------------ python/ray/worker.py | 1 + 9 files changed, 13 insertions(+), 226 deletions(-) diff --git a/python/ray/_private/client_mode_hook.py b/python/ray/_private/client_mode_hook.py index 74682f1cfa9d..3ceef7316abd 100644 --- a/python/ray/_private/client_mode_hook.py +++ b/python/ray/_private/client_mode_hook.py @@ -2,9 +2,6 @@ from contextlib import contextmanager from functools import wraps -# Attr set on func defs to mark they have been converted to client mode. -RAY_CLIENT_MODE_ATTR = "__ray_client_mode_key__" - client_mode_enabled = os.environ.get("RAY_CLIENT_MODE", "0") == "1" _client_hook_enabled = True @@ -37,54 +34,16 @@ def disable_client_hook(): def client_mode_hook(func): - """Decorator for ray module methods to delegate to ray client""" + """ + Decorator for ray module methods to delegate to ray client + """ from ray.util.client import ray @wraps(func) def wrapper(*args, **kwargs): - if client_mode_should_convert(): + global _client_hook_enabled + if client_mode_enabled and _client_hook_enabled: return getattr(ray, func.__name__)(*args, **kwargs) return func(*args, **kwargs) return wrapper - - -def client_mode_should_convert(): - global _client_hook_enabled - return client_mode_enabled and _client_hook_enabled - - -def client_mode_convert_function(func_cls, in_args, in_kwargs, **kwargs): - """Runs a preregistered ray RemoteFunction through the ray client. - - The common case for this is to transparently convert that RemoteFunction - to a ClientRemoteFunction. 
This happens in circumstances where the - RemoteFunction is declared early, in a library and only then is Ray used in - client mode -- nescessitating a conversion. - """ - from ray.util.client import ray - - key = getattr(func_cls, RAY_CLIENT_MODE_ATTR, None) - if key is None: - key = ray._convert_function(func_cls) - setattr(func_cls, RAY_CLIENT_MODE_ATTR, key) - client_func = ray._get_converted(key) - return client_func._remote(in_args, in_kwargs, **kwargs) - - -def client_mode_convert_actor(actor_cls, in_args, in_kwargs, **kwargs): - """Runs a preregistered actor class on the ray client - - The common case for this decorator is for instantiating an ActorClass - transparently as a ClientActorClass. This happens in circumstances where - the ActorClass is declared early, in a library and only then is Ray used in - client mode -- nescessitating a conversion. - """ - from ray.util.client import ray - - key = getattr(actor_cls, RAY_CLIENT_MODE_ATTR, None) - if key is None: - key = ray._convert_actor(actor_cls) - setattr(actor_cls, RAY_CLIENT_MODE_ATTR, key) - client_actor = ray._get_converted(key) - return client_actor._remote(in_args, in_kwargs, **kwargs) diff --git a/python/ray/actor.py b/python/ray/actor.py index b24c04a10dd5..7ff9f1f33e04 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -13,8 +13,6 @@ from ray import ActorClassID, Language from ray._raylet import PythonFunctionDescriptor from ray._private.client_mode_hook import client_mode_hook -from ray._private.client_mode_hook import client_mode_should_convert -from ray._private.client_mode_hook import client_mode_convert_actor from ray import cross_language from ray.util.inspect import ( is_function_or_method, @@ -555,29 +553,6 @@ def _remote(self, if max_concurrency < 1: raise ValueError("max_concurrency must be >= 1") - if client_mode_should_convert(): - return client_mode_convert_actor( - self, - args, - kwargs, - num_cpus=num_cpus, - num_gpus=num_gpus, - memory=memory, - 
object_store_memory=object_store_memory, - resources=resources, - accelerator_type=accelerator_type, - max_concurrency=max_concurrency, - max_restarts=max_restarts, - max_task_retries=max_task_retries, - name=name, - lifetime=lifetime, - placement_group=placement_group, - placement_group_bundle_index=placement_group_bundle_index, - placement_group_capture_child_tasks=( - placement_group_capture_child_tasks), - override_environment_variables=( - override_environment_variables)) - worker = ray.worker.global_worker worker.check_connected() diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index 3b8b42062b3e..e717e2d28fe7 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -4,8 +4,6 @@ from ray import cloudpickle as pickle from ray._raylet import PythonFunctionDescriptor from ray import cross_language, Language -from ray._private.client_mode_hook import client_mode_convert_function -from ray._private.client_mode_hook import client_mode_should_convert from ray.util.placement_group import ( PlacementGroup, check_placement_group_index, @@ -183,26 +181,6 @@ def _remote(self, override_environment_variables=None, name=""): """Submit the remote function for execution.""" - if client_mode_should_convert(): - return client_mode_convert_function( - self, - args, - kwargs, - num_returns=num_returns, - num_cpus=num_cpus, - num_gpus=num_gpus, - memory=memory, - object_store_memory=object_store_memory, - accelerator_type=accelerator_type, - resources=resources, - max_retries=max_retries, - placement_group=placement_group, - placement_group_bundle_index=placement_group_bundle_index, - placement_group_capture_child_tasks=( - placement_group_capture_child_tasks), - override_environment_variables=override_environment_variables, - name=name) - worker = ray.worker.global_worker worker.check_connected() diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py index 9528f1d202fe..5e43ac6314b7 100644 --- 
a/python/ray/tests/test_client_init.py +++ b/python/ray/tests/test_client_init.py @@ -2,7 +2,6 @@ import pytest import time -import random import sys import ray.util.client.server.server as ray_client_server @@ -10,54 +9,6 @@ from ray.util.client import RayAPIStub -import ray - - -@ray.remote -def hello_world(): - c1 = complex_task.remote(random.randint(1, 10)) - c2 = complex_task.remote(random.randint(1, 10)) - return sum(ray.get([c1, c2])) - - -@ray.remote -def complex_task(value): - time.sleep(1) - return value * 10 - - -@ray.remote -class C: - def __init__(self, x): - self.val = x - - def double(self): - self.val += self.val - - def get(self): - return self.val - - -def test_basic_preregister(): - from ray.util.client import ray - server, _ = ray_client_server.init_and_serve("localhost:50051") - try: - ray.connect("localhost:50051") - val = ray.get(hello_world.remote()) - print(val) - assert val >= 20 - assert val <= 200 - c = C.remote(3) - x = c.double.remote() - y = c.double.remote() - ray.wait([x, y]) - val = ray.get(c.get.remote()) - assert val == 12 - finally: - ray.disconnect() - ray_client_server.shutdown_with_server(server) - time.sleep(2) - def test_num_clients(): # Tests num clients reporting; useful if you want to build an app that diff --git a/python/ray/util/client/api.py b/python/ray/util/client/api.py index 5b1ae881e5cd..7d8576d1f276 100644 --- a/python/ray/util/client/api.py +++ b/python/ray/util/client/api.py @@ -4,8 +4,6 @@ from ray.util.client.runtime_context import ClientWorkerPropertyAPI from typing import TYPE_CHECKING if TYPE_CHECKING: - from ray.actor import ActorClass - from ray.remote_function import RemoteFunction from ray.util.client.common import ClientStub from ray.util.client.common import ClientActorHandle from ray.util.client.common import ClientObjectRef @@ -267,18 +265,6 @@ def _internal_kv_list(self, prefix: bytes) -> bytes: """Hook for internal_kv._internal_kv_list.""" return self.worker.internal_kv_list(as_bytes(prefix)) - 
def _convert_actor(self, actor: "ActorClass") -> str: - """Register a ClientActorClass for the ActorClass and return a UUID""" - return self.worker._convert_actor(actor) - - def _convert_function(self, func: "RemoteFunction") -> str: - """Register a ClientRemoteFunc for the ActorClass and return a UUID""" - return self.worker._convert_function(func) - - def _get_converted(self, key: str) -> "ClientStub": - """Given a UUID, return the converted object""" - return self.worker._get_converted(key) - def __getattr__(self, key: str): if not key.startswith("_"): raise NotImplementedError( diff --git a/python/ray/util/client/common.py b/python/ray/util/client/common.py index 8eac0983a390..2bcd14f3f586 100644 --- a/python/ray/util/client/common.py +++ b/python/ray/util/client/common.py @@ -82,11 +82,7 @@ def remote(self, *args, **kwargs): def options(self, **kwargs): return OptionWrapper(self, kwargs) - def _remote(self, args=None, kwargs=None, **option_args): - if args is None: - args = [] - if kwargs is None: - kwargs = {} + def _remote(self, args=[], kwargs={}, **option_args): return self.options(**option_args).remote(*args, **kwargs) def __repr__(self): @@ -154,11 +150,7 @@ def remote(self, *args, **kwargs) -> "ClientActorHandle": def options(self, **kwargs): return ActorOptionWrapper(self, kwargs) - def _remote(self, args=None, kwargs=None, **option_args): - if args is None: - args = [] - if kwargs is None: - kwargs = {} + def _remote(self, args=[], kwargs={}, **option_args): return self.options(**option_args).remote(*args, **kwargs) def __repr__(self): @@ -238,11 +230,7 @@ def __repr__(self): def options(self, **kwargs): return OptionWrapper(self, kwargs) - def _remote(self, args=None, kwargs=None, **option_args): - if args is None: - args = [] - if kwargs is None: - kwargs = {} + def _remote(self, args=[], kwargs={}, **option_args): return self.options(**option_args).remote(*args, **kwargs) def _prepare_client_task(self) -> ray_client_pb2.ClientTask: diff --git 
a/python/ray/util/client/options.py b/python/ray/util/client/options.py index b2f1dae8138a..79727b126473 100644 --- a/python/ray/util/client/options.py +++ b/python/ray/util/client/options.py @@ -46,10 +46,9 @@ def validate_options( raise TypeError(f"Invalid option passed to remote(): {k}") validator = options[k] if len(validator) != 0: - if v is not None: - if not isinstance(v, validator[0]): - raise ValueError(validator[2]) - if not validator[1](v): - raise ValueError(validator[2]) + if not isinstance(v, validator[0]): + raise ValueError(validator[2]) + if not validator[1](v): + raise ValueError(validator[2]) out[k] = v return out diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index 3f04c80a48ca..535ec5ab76b4 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -13,7 +13,6 @@ from typing import List from typing import Tuple from typing import Optional -from typing import TYPE_CHECKING import grpc @@ -23,19 +22,12 @@ from ray.util.client.client_pickler import convert_to_arg from ray.util.client.client_pickler import dumps_from_client from ray.util.client.client_pickler import loads_from_server -from ray.util.client.common import ClientStub from ray.util.client.common import ClientActorHandle -from ray.util.client.common import ClientActorClass -from ray.util.client.common import ClientRemoteFunc from ray.util.client.common import ClientActorRef from ray.util.client.common import ClientObjectRef from ray.util.client.dataclient import DataClient from ray.util.client.logsclient import LogstreamClient -if TYPE_CHECKING: - from ray.actor import ActorClass - from ray.remote_function import RemoteFunction - logger = logging.getLogger(__name__) INITIAL_TIMEOUT_SEC = 5 @@ -70,7 +62,6 @@ def __init__(self, self.channel = None self._conn_state = grpc.ChannelConnectivity.IDLE self._client_id = make_client_id() - self._converted: Dict[str, ClientStub] = {} if secure: credentials = 
grpc.ssl_channel_credentials() self.channel = grpc.secure_channel(conn_str, credentials) @@ -380,47 +371,6 @@ def is_initialized(self) -> bool: def is_connected(self) -> bool: return self._conn_state == grpc.ChannelConnectivity.READY - def _convert_actor(self, actor: "ActorClass") -> str: - """Register a ClientActorClass for the ActorClass and return a UUID""" - key = uuid.uuid4().hex - md = actor.__ray_metadata__ - cls = md.modified_class - self._converted[key] = ClientActorClass( - cls, - options={ - "max_restarts": md.max_restarts, - "max_task_retries": md.max_task_retries, - "num_cpus": md.num_cpus, - "num_gpus": md.num_gpus, - "memory": md.memory, - "object_store_memory": md.object_store_memory, - "resources": md.resources, - "accelerator_type": md.accelerator_type, - }) - return key - - def _convert_function(self, func: "RemoteFunction") -> str: - """Register a ClientRemoteFunc for the ActorClass and return a UUID""" - key = uuid.uuid4().hex - f = func._function - self._converted[key] = ClientRemoteFunc( - f, - options={ - "num_cpus": func._num_cpus, - "num_gpus": func._num_gpus, - "max_calls": func._max_calls, - "max_retries": func._max_retries, - "resources": func._resources, - "accelerator_type": func._accelerator_type, - "num_returns": func._num_returns, - "memory": func._memory - }) - return key - - def _get_converted(self, key: str) -> "ClientStub": - """Given a UUID, return the converted object""" - return self._converted[key] - def make_client_id() -> str: id = uuid.uuid4() diff --git a/python/ray/worker.py b/python/ray/worker.py index 00d99930cf95..337b4ffc95fe 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -1768,6 +1768,7 @@ def decorator(function_or_class): return decorator +@client_mode_hook def remote(*args, **kwargs): """Defines a remote function or an actor class. 
From 4218eeb956436e7a76b066236fc323e7c85c4de1 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 100/244] Revert "[Core] Put raylet ip's in resource usage report (#13871)" This reverts commit 988ff31cbab1c773b7e4fb1e1d1c69c9d19f87cd. --- python/ray/tests/test_global_state.py | 26 -------------------------- src/ray/protobuf/gcs.proto | 2 -- src/ray/raylet/node_manager.cc | 1 - 3 files changed, 29 deletions(-) diff --git a/python/ray/tests/test_global_state.py b/python/ray/tests/test_global_state.py index 7522039eceed..3dcd64c1ebd2 100644 --- a/python/ray/tests/test_global_state.py +++ b/python/ray/tests/test_global_state.py @@ -7,7 +7,6 @@ import ray import ray.ray_constants -import ray.services import ray.test_utils from ray._raylet import GlobalStateAccessor @@ -333,31 +332,6 @@ def backlog_size_set(): global_state_accessor.disconnect() -def test_heartbeat_ip(shutdown_only): - cluster = ray.init( - num_cpus=1, _system_config={ - "report_worker_backlog": True, - }) - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - self_ip = ray.services.get_node_ip_address() - - def self_ip_is_set(): - message = global_state_accessor.get_all_resource_usage() - if message is None: - return False - - resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - resources_data = resource_usage.batch[0] - return resources_data.node_manager_address == self_ip - - ray.test_utils.wait_for_condition(self_ip_is_set, timeout=2) - global_state_accessor.disconnect() - - if __name__ == "__main__": import pytest import sys diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index a56bffbe1147..902c29cb7f58 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -327,8 +327,6 @@ message ResourcesData { ResourceLoad resource_load_by_shape = 7; // Whether 
this node manager is requesting global GC. bool should_global_gc = 8; - // IP address of the node. - string node_manager_address = 9; } message ResourceUsageBatchData { diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index e784758b1c92..cbe287ef721d 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -456,7 +456,6 @@ void NodeManager::Heartbeat() { void NodeManager::ReportResourceUsage() { auto resources_data = std::make_shared(); resources_data->set_node_id(self_node_id_.Binary()); - resources_data->set_node_manager_address(initial_config_.node_manager_address); // Update local chche from gcs remote cache, this is needed when gcs restart. // We should always keep the cache view consistent. cluster_resource_scheduler_->UpdateLastResourceUsage( From f08e8143a267698532c788d51d887f53b2603988 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 101/244] Revert "Enabling the cancellation of non-actor tasks in a worker's queue 2 (#13244)" This reverts commit cc2e895a48ef4a475cfbe09a80fdcf966d18ecf7. 
--- python/ray/tests/test_cancel.py | 9 +-- src/ray/core_worker/core_worker.cc | 15 +---- src/ray/core_worker/test/core_worker_test.cc | 42 -------------- src/ray/core_worker/test/mock_worker.cc | 11 ---- .../core_worker/test/scheduling_queue_test.cc | 27 ++------- .../transport/direct_actor_transport.cc | 10 +--- .../transport/direct_actor_transport.h | 56 +++---------------- 7 files changed, 19 insertions(+), 151 deletions(-) diff --git a/python/ray/tests/test_cancel.py b/python/ray/tests/test_cancel.py index aefff09fae62..11b4dfbd4e64 100644 --- a/python/ray/tests/test_cancel.py +++ b/python/ray/tests/test_cancel.py @@ -175,8 +175,6 @@ def infinite_sleep(y): sleep_or_no = [random.randint(0, 1) for _ in range(100)] tasks = [infinite_sleep.remote(i) for i in sleep_or_no] cancelled = set() - - # Randomly kill queued tasks (infinitely sleeping or not). for t in tasks: if random.random() > 0.5: ray.cancel(t, force=use_force) @@ -188,13 +186,10 @@ def infinite_sleep(y): for done in cancelled: with pytest.raises(valid_exceptions(use_force)): ray.get(done, timeout=120) - - # Kill all infinitely sleeping tasks (queued or not). for indx, t in enumerate(tasks): if sleep_or_no[indx]: ray.cancel(t, force=use_force) cancelled.add(t) - for indx, t in enumerate(tasks): if t in cancelled: with pytest.raises(valid_exceptions(use_force)): ray.get(t, timeout=120) @@ -218,8 +213,8 @@ def fast(y): # between a worker receiving a task and the worker executing # that task (specifically the python execution), Cancellation # can fail. 
- - time.sleep(0.1) + if not use_force: + time.sleep(0.1) ray.cancel(x, force=use_force) ids.append(x) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index b56f18cf04e4..1961406d8a8a 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -760,7 +760,6 @@ void CoreWorker::InternalHeartbeat(const boost::system::error_code &error) { } absl::MutexLock lock(&mutex_); - while (!to_resubmit_.empty() && current_time_ms() > to_resubmit_.front().first) { auto &spec = to_resubmit_.front().second; if (spec.IsActorTask()) { @@ -2267,17 +2266,12 @@ void CoreWorker::HandleCancelTask(const rpc::CancelTaskRequest &request, rpc::SendReplyCallback send_reply_callback) { absl::MutexLock lock(&mutex_); TaskID task_id = TaskID::FromBinary(request.intended_task_id()); - bool requested_task_running = main_thread_task_id_ == task_id; - bool success = requested_task_running; + bool success = main_thread_task_id_ == task_id; // Try non-force kill - if (requested_task_running && !request.force_kill()) { + if (success && !request.force_kill()) { RAY_LOG(INFO) << "Interrupting a running task " << main_thread_task_id_; success = options_.kill_main(); - } else if (!requested_task_running) { - // If the task is not currently running, check if it is in the worker's queue of - // normal tasks, and remove it if found. 
- success = direct_task_receiver_->CancelQueuedNormalTask(task_id); } if (request.recursive()) { auto recursive_cancel = CancelChildren(task_id, request.force_kill()); @@ -2286,14 +2280,11 @@ void CoreWorker::HandleCancelTask(const rpc::CancelTaskRequest &request, } } - // TODO: fix race condition to avoid using this hack - requested_task_running = main_thread_task_id_ == task_id; - reply->set_attempt_succeeded(success); send_reply_callback(Status::OK(), nullptr, nullptr); // Do force kill after reply callback sent - if (requested_task_running && request.force_kill()) { + if (success && request.force_kill()) { RAY_LOG(INFO) << "Force killing a worker running " << main_thread_task_id_; Disconnect(); if (options_.enable_logging) { diff --git a/src/ray/core_worker/test/core_worker_test.cc b/src/ray/core_worker/test/core_worker_test.cc index cf1bab624de2..82ea826175e4 100644 --- a/src/ray/core_worker/test/core_worker_test.cc +++ b/src/ray/core_worker/test/core_worker_test.cc @@ -841,48 +841,6 @@ TEST_F(SingleNodeTest, TestNormalTaskLocal) { TestNormalTask(resources); } -TEST_F(SingleNodeTest, TestCancelTasks) { - auto &driver = CoreWorkerProcess::GetCoreWorker(); - - // Create two functions, each implementing a while(true) loop. - RayFunction func1(ray::Language::PYTHON, ray::FunctionDescriptorBuilder::BuildPython( - "WhileTrueLoop", "", "", "")); - RayFunction func2(ray::Language::PYTHON, ray::FunctionDescriptorBuilder::BuildPython( - "WhileTrueLoop", "", "", "")); - // Return IDs for the two functions that implement while(true) loops. - std::vector return_ids1; - std::vector return_ids2; - - // Create default args and options needed to submit the tasks that encapsulate func1 and - // func2. - std::vector> args; - TaskOptions options; - - // Submit func1. The function should start looping forever. 
- driver.SubmitTask(func1, args, options, &return_ids1, /*max_retries=*/0, - std::make_pair(PlacementGroupID::Nil(), -1), true, - /*debugger_breakpoint=*/""); - ASSERT_EQ(return_ids1.size(), 1); - - // Submit func2. The function should be queued at the worker indefinitely. - driver.SubmitTask(func2, args, options, &return_ids2, /*max_retries=*/0, - std::make_pair(PlacementGroupID::Nil(), -1), true, - /*debugger_breakpoint=*/""); - ASSERT_EQ(return_ids2.size(), 1); - - // Cancel func2 by removing it from the worker's queue - RAY_CHECK_OK(driver.CancelTask(return_ids2[0], true, false)); - - // Cancel func1, which is currently running. - RAY_CHECK_OK(driver.CancelTask(return_ids1[0], true, false)); - - // TestNormalTask will get stuck unless both func1 and func2 have been cancelled. Thus, - // if TestNormalTask succeeds, we know that func2 must have been removed from the - // worker's queue. - std::unordered_map resources; - TestNormalTask(resources); -} - TEST_F(TwoNodeTest, TestNormalTaskCrossNodes) { std::unordered_map resources; resources.emplace("resource1", 1); diff --git a/src/ray/core_worker/test/mock_worker.cc b/src/ray/core_worker/test/mock_worker.cc index 03a78a1981a7..4439519bb5ce 100644 --- a/src/ray/core_worker/test/mock_worker.cc +++ b/src/ray/core_worker/test/mock_worker.cc @@ -79,8 +79,6 @@ class MockWorker { } else if ("MergeInputArgsAsOutput" == typed_descriptor->ModuleName()) { // Merge input args and write the merged content to each of return ids return MergeInputArgsAsOutput(args, return_ids, results); - } else if ("WhileTrueLoop" == typed_descriptor->ModuleName()) { - return WhileTrueLoop(args, return_ids, results); } else { return Status::TypeError("Unknown function descriptor: " + typed_descriptor->ModuleName()); @@ -130,15 +128,6 @@ class MockWorker { return Status::OK(); } - Status WhileTrueLoop(const std::vector> &args, - const std::vector &return_ids, - std::vector> *results) { - while (1) { - 
std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - return Status::OK(); - } - int64_t prev_seq_no_ = 0; }; diff --git a/src/ray/core_worker/test/scheduling_queue_test.cc b/src/ray/core_worker/test/scheduling_queue_test.cc index 6854c1810e3e..8c8e60fd5251 100644 --- a/src/ray/core_worker/test/scheduling_queue_test.cc +++ b/src/ray/core_worker/test/scheduling_queue_test.cc @@ -66,9 +66,9 @@ TEST(SchedulingQueueTest, TestWaitForObjects) { auto fn_ok = [&n_ok]() { n_ok++; }; auto fn_rej = [&n_rej]() { n_rej++; }; queue.Add(0, -1, fn_ok, fn_rej); - queue.Add(1, -1, fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj1})); - queue.Add(2, -1, fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj2})); - queue.Add(3, -1, fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj3})); + queue.Add(1, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj1})); + queue.Add(2, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj2})); + queue.Add(3, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj3})); ASSERT_EQ(n_ok, 1); waiter.Complete(0); @@ -92,7 +92,7 @@ TEST(SchedulingQueueTest, TestWaitForObjectsNotSubjectToSeqTimeout) { auto fn_ok = [&n_ok]() { n_ok++; }; auto fn_rej = [&n_rej]() { n_rej++; }; queue.Add(0, -1, fn_ok, fn_rej); - queue.Add(1, -1, fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj1})); + queue.Add(1, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj1})); ASSERT_EQ(n_ok, 1); io_service.run(); ASSERT_EQ(n_rej, 0); @@ -158,25 +158,6 @@ TEST(SchedulingQueueTest, TestSkipAlreadyProcessedByClient) { ASSERT_EQ(n_rej, 2); } -TEST(SchedulingQueueTest, TestCancelQueuedTask) { - NormalSchedulingQueue *queue = new NormalSchedulingQueue(); - ASSERT_TRUE(queue->TaskQueueEmpty()); - int n_ok = 0; - int n_rej = 0; - auto fn_ok = [&n_ok]() { n_ok++; }; - auto fn_rej = [&n_rej]() { n_rej++; }; - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - 
ASSERT_TRUE(queue->CancelTaskIfFound(TaskID::Nil())); - ASSERT_FALSE(queue->TaskQueueEmpty()); - queue->ScheduleRequests(); - ASSERT_EQ(n_ok, 4); - ASSERT_EQ(n_rej, 0); -} - } // namespace ray int main(int argc, char **argv) { diff --git a/src/ray/core_worker/transport/direct_actor_transport.cc b/src/ray/core_worker/transport/direct_actor_transport.cc index bac80af4f7a6..e266b0d94f01 100644 --- a/src/ray/core_worker/transport/direct_actor_transport.cc +++ b/src/ray/core_worker/transport/direct_actor_transport.cc @@ -482,12 +482,12 @@ void CoreWorkerDirectTaskReceiver::HandleTask( // TODO(swang): Remove this with legacy raylet code. dependencies.pop_back(); it->second->Add(request.sequence_number(), request.client_processed_up_to(), - accept_callback, reject_callback, task_spec.TaskId(), dependencies); + accept_callback, reject_callback, dependencies); } else { // Add the normal task's callbacks to the non-actor scheduling queue. normal_scheduling_queue_->Add(request.sequence_number(), request.client_processed_up_to(), accept_callback, - reject_callback, task_spec.TaskId(), dependencies); + reject_callback, dependencies); } } @@ -501,10 +501,4 @@ void CoreWorkerDirectTaskReceiver::RunNormalTasksFromQueue() { normal_scheduling_queue_->ScheduleRequests(); } -bool CoreWorkerDirectTaskReceiver::CancelQueuedNormalTask(TaskID task_id) { - // Look up the task to be canceled in the queue of normal tasks. If it is found and - // removed successfully, return true. 
- return normal_scheduling_queue_->CancelTaskIfFound(task_id); -} - } // namespace ray diff --git a/src/ray/core_worker/transport/direct_actor_transport.h b/src/ray/core_worker/transport/direct_actor_transport.h index cbd0a82fccf6..ab28dc85a8ba 100644 --- a/src/ray/core_worker/transport/direct_actor_transport.h +++ b/src/ray/core_worker/transport/direct_actor_transport.h @@ -254,23 +254,19 @@ class InboundRequest { public: InboundRequest(){}; InboundRequest(std::function accept_callback, - std::function reject_callback, TaskID task_id, - bool has_dependencies) + std::function reject_callback, bool has_dependencies) : accept_callback_(accept_callback), reject_callback_(reject_callback), - task_id(task_id), has_pending_dependencies_(has_dependencies) {} void Accept() { accept_callback_(); } void Cancel() { reject_callback_(); } bool CanExecute() const { return !has_pending_dependencies_; } - ray::TaskID TaskID() const { return task_id; } void MarkDependenciesSatisfied() { has_pending_dependencies_ = false; } private: std::function accept_callback_; std::function reject_callback_; - ray::TaskID task_id; bool has_pending_dependencies_; }; @@ -350,11 +346,10 @@ class SchedulingQueue { public: virtual void Add(int64_t seq_no, int64_t client_processed_up_to, std::function accept_request, - std::function reject_request, TaskID task_id = TaskID::Nil(), + std::function reject_request, const std::vector &dependencies = {}) = 0; virtual void ScheduleRequests() = 0; virtual bool TaskQueueEmpty() const = 0; - virtual bool CancelTaskIfFound(TaskID task_id) = 0; virtual ~SchedulingQueue(){}; }; @@ -376,7 +371,6 @@ class ActorSchedulingQueue : public SchedulingQueue { /// Add a new actor task's callbacks to the worker queue. void Add(int64_t seq_no, int64_t client_processed_up_to, std::function accept_request, std::function reject_request, - TaskID task_id = TaskID::Nil(), const std::vector &dependencies = {}) { // A seq_no of -1 means no ordering constraint. 
Actor tasks must be executed in order. RAY_CHECK(seq_no != -1); @@ -389,7 +383,7 @@ class ActorSchedulingQueue : public SchedulingQueue { } RAY_LOG(DEBUG) << "Enqueue " << seq_no << " cur seqno " << next_seq_no_; pending_actor_tasks_[seq_no] = - InboundRequest(accept_request, reject_request, task_id, dependencies.size() > 0); + InboundRequest(accept_request, reject_request, dependencies.size() > 0); if (dependencies.size() > 0) { waiter_.Wait(dependencies, [seq_no, this]() { RAY_CHECK(boost::this_thread::get_id() == main_thread_id_); @@ -403,15 +397,6 @@ class ActorSchedulingQueue : public SchedulingQueue { ScheduleRequests(); } - // We don't allow the cancellation of actor tasks, so invoking CancelTaskIfFound results - // in a fatal error. - bool CancelTaskIfFound(TaskID task_id) { - RAY_CHECK(false) << "Cannot cancel actor tasks"; - // The return instruction will never be executed, but we need to include it - // nonetheless because this is a non-void function. - return false; - } - /// Schedules as many requests as possible in sequence. void ScheduleRequests() { // Only call SetMaxActorConcurrency to configure threadpool size when the @@ -535,45 +520,22 @@ class NormalSchedulingQueue : public SchedulingQueue { /// Add a new task's callbacks to the worker queue. void Add(int64_t seq_no, int64_t client_processed_up_to, std::function accept_request, std::function reject_request, - TaskID task_id = TaskID::Nil(), const std::vector &dependencies = {}) { absl::MutexLock lock(&mu_); // Normal tasks should not have ordering constraints. RAY_CHECK(seq_no == -1); // Create a InboundRequest object for the new task, and add it to the queue. pending_normal_tasks_.push_back( - InboundRequest(accept_request, reject_request, task_id, dependencies.size() > 0)); - } - - // Search for an InboundRequest associated with the task that we are trying to cancel. - // If found, remove the InboundRequest from the queue and return true. Otherwise, return - // false. 
- bool CancelTaskIfFound(TaskID task_id) { - absl::MutexLock lock(&mu_); - for (std::deque::reverse_iterator it = pending_normal_tasks_.rbegin(); - it != pending_normal_tasks_.rend(); ++it) { - if (it->TaskID() == task_id) { - pending_normal_tasks_.erase(std::next(it).base()); - return true; - } - } - return false; + InboundRequest(accept_request, reject_request, dependencies.size() > 0)); } /// Schedules as many requests as possible in sequence. void ScheduleRequests() { - while (true) { - InboundRequest head; - { - absl::MutexLock lock(&mu_); - if (!pending_normal_tasks_.empty()) { - head = pending_normal_tasks_.front(); - pending_normal_tasks_.pop_front(); - } else { - return; - } - } + absl::MutexLock lock(&mu_); + while (!pending_normal_tasks_.empty()) { + auto &head = pending_normal_tasks_.front(); head.Accept(); + pending_normal_tasks_.pop_front(); } } @@ -621,8 +583,6 @@ class CoreWorkerDirectTaskReceiver { /// Pop tasks from the queue and execute them sequentially void RunNormalTasksFromQueue(); - bool CancelQueuedNormalTask(TaskID task_id); - private: // Worker context. WorkerContext &worker_context_; From b2f4815a9b1fe97d9b054540fe66b4e340d69c47 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 102/244] Revert "[docs] Update actors.rst (#13873)" This reverts commit d3bef59c8ae4497e54aa4d0a5e60d7e534c9abe9. --- doc/source/actors.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/actors.rst b/doc/source/actors.rst index 9e4a0fd34dba..c680b1558b3d 100644 --- a/doc/source/actors.rst +++ b/doc/source/actors.rst @@ -105,7 +105,7 @@ Methods of the actor can be called remotely. 
counter_actor = Counter.remote() - assert ray.get(counter_actor.increment.remote()) == 1 + assert counter_actor.increment.remote() == 1 @ray.remote class Foo(object): From 164a3f535811ada54aaba04f02e58abd929d97d3 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 103/244] Revert "[serve] Small cleanups for BackendState (#13870)" This reverts commit 2b7a010d3bcfdddcd1452c79ac73be847343f210. --- python/ray/serve/backend_state.py | 53 ++++++++++++++++++++++++------- python/ray/serve/controller.py | 2 +- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index 418ab3b2ad12..4aad2671ea4e 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -347,10 +347,40 @@ def update_backend_config(self, backend_tag: BackendTag, return new_goal_id - def _scale_backend_replicas( + def _start_backend_replica(self, backend_tag: BackendTag, + replica_tag: ReplicaTag) -> ActorHandle: + """Start a replica and return its actor handle. + + Checks if the named actor already exists before starting a new one. + + Assumes that the backend configuration is already in the Goal State. + """ + # NOTE(edoakes): the replicas may already be created if we + # failed after creating them but before writing a + # checkpoint. 
+ replica_name = format_actor_name(replica_tag, self._controller_name) + try: + replica_handle = ray.get_actor(replica_name) + except ValueError: + logger.debug("Starting replica '{}' for backend '{}'.".format( + replica_tag, backend_tag)) + backend_info = self.get_backend(backend_tag) + + replica_handle = ray.remote(backend_info.worker_class).options( + name=replica_name, + lifetime="detached" if self._detached else None, + max_restarts=-1, + max_task_retries=-1, + **backend_info.replica_config.ray_actor_options).remote( + backend_tag, replica_tag, + backend_info.replica_config.actor_init_args, + backend_info.backend_config, self._controller_name) + + return replica_handle + + def scale_backend_replicas( self, backend_tag: BackendTag, - num_replicas: int, ) -> bool: """Scale the given backend to the number of replicas. @@ -361,6 +391,8 @@ def _scale_backend_replicas( inconsistencies with starting/stopping a replica and then crashing before writing a checkpoint. """ + num_replicas = self._target_replicas.get(backend_tag, 0) + logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) assert (backend_tag in self._backend_metadata @@ -429,11 +461,11 @@ def _scale_backend_replicas( return True - def _scale_all_backends(self): + def scale_all_backends(self): checkpoint_needed = False for backend_tag, num_replicas in list(self._target_replicas.items()): - checkpoint_needed |= self._scale_backend_replicas( - backend_tag, num_replicas) + checkpoint_needed = (checkpoint_needed + or self.scale_backend_replicas(backend_tag)) if num_replicas == 0: del self._backend_metadata[backend_tag] del self._target_replicas[backend_tag] @@ -469,24 +501,23 @@ def _completed_goals(self) -> List[GoalId]: or state_dict.get(ReplicaState.STOPPING)): continue - # Check for deleting. 
+ # TODO(ilr): FIX + # Check for deleting if (not desired_num_replicas or desired_num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): completed_goals.append( self.backend_goals.pop(backend_tag, None)) - # Check for a non-zero number of backends. + # Check for a non-zero number of backends if (desired_num_replicas and existing_info) \ and desired_num_replicas == len(existing_info): completed_goals.append( self.backend_goals.pop(backend_tag, None)) return [goal for goal in completed_goals if goal] - def update(self) -> bool: - """Updates the state of all running replicas to match the goal state. - """ - self._scale_all_backends() + async def update(self) -> bool: + self.scale_all_backends() for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index 0ad444a54b36..b5c65111a8f9 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -111,7 +111,7 @@ async def run_control_loop(self) -> None: while True: async with self.write_lock: self.http_state.update() - self.backend_state.update() + await self.backend_state.update() await asyncio.sleep(CONTROL_LOOP_PERIOD_S) From 72c1dafd07ec87b4e087051a3af141aeb41dfe2a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 104/244] Revert "[joblib] Log once the context warning argument. (#13865)" This reverts commit 3d89191bd528ecd3cd90cb0040732850aa36aeb5. 
--- python/ray/util/multiprocessing/pool.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/util/multiprocessing/pool.py b/python/ray/util/multiprocessing/pool.py index b74e10279568..9910bc3a46a9 100644 --- a/python/ray/util/multiprocessing/pool.py +++ b/python/ray/util/multiprocessing/pool.py @@ -9,7 +9,6 @@ import copy import ray -from ray.util import log_once logger = logging.getLogger(__name__) @@ -337,7 +336,7 @@ def __init__(self, self._maxtasksperchild = maxtasksperchild or -1 self._actor_deletion_ids = [] - if context and log_once("context_argument_warning"): + if context: logger.warning("The 'context' argument is not supported using " "ray. Please refer to the documentation for how " "to control ray initialization.") From 7fdec7752fa3ddfb35e2fdfb0385557d7324746a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 105/244] Revert "Move the tune driver into a remote task (#13778)" This reverts commit 32824853325ba7e2f4f79edd5a9fb34ddc0b40c6. 
--- python/ray/tune/BUILD | 8 -- python/ray/tune/ray_trial_executor.py | 13 +++ python/ray/tune/tests/test_remote.py | 77 ------------- python/ray/tune/tests/test_trial_runner_3.py | 2 - .../tune/tests/test_trial_runner_callbacks.py | 1 - python/ray/tune/trial.py | 13 +-- python/ray/tune/tune.py | 103 +----------------- 7 files changed, 20 insertions(+), 197 deletions(-) delete mode 100644 python/ray/tune/tests/test_remote.py diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index b013dc4e4751..007055364a78 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -163,14 +163,6 @@ py_test( tags = ["exclusive"], ) -py_test( - name = "test_remote", - size = "medium", - srcs = ["tests/test_remote.py"], - deps = [":tune_lib"], - tags = ["exclusive"], -) - py_test( name = "test_sample", size = "medium", diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py index c5aaeee79a8e..26480118c2b0 100644 --- a/python/ray/tune/ray_trial_executor.py +++ b/python/ray/tune/ray_trial_executor.py @@ -154,7 +154,15 @@ class RayTrialExecutor(TrialExecutor): def __init__(self, queue_trials: bool = False, reuse_actors: bool = False, + ray_auto_init: Optional[bool] = None, refresh_period: Optional[float] = None): + if ray_auto_init is None: + if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1": + logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.") + ray_auto_init = False + else: + ray_auto_init = True + super(RayTrialExecutor, self).__init__(queue_trials) # Check for if we are launching a trial without resources in kick off # autoscaler. @@ -185,6 +193,11 @@ def __init__(self, self._last_ip_refresh = float("-inf") self._last_ip_addresses = set() self._last_nontrivial_wait = time.time() + if not ray.is_initialized() and ray_auto_init: + logger.info("Initializing Ray automatically." 
+ "For cluster usage or custom Ray initialization, " + "call `ray.init(...)` before `tune.run`.") + ray.init() if ray.is_initialized(): self._update_avail_resources() diff --git a/python/ray/tune/tests/test_remote.py b/python/ray/tune/tests/test_remote.py deleted file mode 100644 index 1e521c54b7a6..000000000000 --- a/python/ray/tune/tests/test_remote.py +++ /dev/null @@ -1,77 +0,0 @@ -import unittest - -import ray -from ray.tune import register_trainable, run_experiments, run -from ray.tune.result import TIMESTEPS_TOTAL -from ray.tune.experiment import Experiment -from ray.tune.trial import Trial -from ray.util.client.ray_client_helpers import ray_start_client_server - - -class RemoteTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testRemoteRunExperiments(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - exp1 = Experiment(**{ - "name": "foo", - "run": "f1", - }) - [trial] = run_experiments(exp1, _remote=True) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testRemoteRun(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - analysis = run(train, _remote=True) - [trial] = analysis.trials - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testRemoteRunExperimentsInClient(self): - ray.init() - assert not ray.util.client.ray.is_connected() - with ray_start_client_server(): - assert ray.util.client.ray.is_connected() - - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - exp1 = Experiment(**{ - "name": "foo", - "run": "f1", - }) - [trial] = run_experiments(exp1) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testRemoteRunInClient(self): - ray.init() - 
assert not ray.util.client.ray.is_connected() - with ray_start_client_server(): - assert ray.util.client.ray.is_connected() - - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - analysis = run(train) - [trial] = analysis.trials - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_trial_runner_3.py b/python/ray/tune/tests/test_trial_runner_3.py index 3c2d05981677..b0c4a7063546 100644 --- a/python/ray/tune/tests/test_trial_runner_3.py +++ b/python/ray/tune/tests/test_trial_runner_3.py @@ -697,8 +697,6 @@ def num_checkpoints(trial): @patch("ray.tune.syncer.CLOUD_SYNC_PERIOD", 0) def testCheckpointAutoPeriod(self): - ray.init(num_cpus=3) - # This makes checkpointing take 2 seconds. def sync_up(source, target): time.sleep(2) diff --git a/python/ray/tune/tests/test_trial_runner_callbacks.py b/python/ray/tune/tests/test_trial_runner_callbacks.py index 6211220c2458..75b06d0e34c8 100644 --- a/python/ray/tune/tests/test_trial_runner_callbacks.py +++ b/python/ray/tune/tests/test_trial_runner_callbacks.py @@ -73,7 +73,6 @@ def get_next_failed_trial(self): class TrialRunnerCallbacks(unittest.TestCase): def setUp(self): - ray.init() self.tmpdir = tempfile.mkdtemp() self.callback = TestCallback() self.executor = _MockTrialExecutor() diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 0070177803df..fc6152f97a40 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -166,13 +166,6 @@ class Trial: """ - _nonjson_fields = [ - "results", - "best_result", - "param_config", - "extra_arg", - ] - PENDING = "PENDING" RUNNING = "RUNNING" PAUSED = "PAUSED" @@ -296,6 +289,12 @@ def __init__(self, self.param_config = None self.extra_arg = None + self._nonjson_fields = [ + "results", + "best_result", + "param_config", + 
"extra_arg", + ] if trial_name_creator: self.custom_trial_name = trial_name_creator(self) diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index 6ce115126e8a..7df9e10570ec 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -8,7 +8,6 @@ import sys import time -import ray from ray.tune.analysis import ExperimentAnalysis from ray.tune.callback import Callback from ray.tune.error import TuneError @@ -112,7 +111,6 @@ def run( sync_to_cloud: Optional = None, sync_to_driver: Optional = None, sync_on_checkpoint: Optional = None, - _remote: bool = None, ) -> ExperimentAnalysis: """Executes training. @@ -272,9 +270,6 @@ def run( ``ray.tune.callback.Callback`` class. If not passed, `LoggerCallback` and `SyncerCallback` callbacks are automatically added. - _remote (bool): Whether to run the Tune driver in a remote function. - This is disabled automatically if a custom trial executor is - passed in. This is enabled by default in Ray client mode. Returns: ExperimentAnalysis: Object for experiment analysis. @@ -282,64 +277,6 @@ def run( Raises: TuneError: Any trials failed and `raise_on_failed_trial` is True. 
""" - - if _remote is None: - _remote = ray.util.client.ray.is_connected() - - if _remote is True and trial_executor: - raise ValueError("cannot use custom trial executor") - - if not trial_executor or isinstance(trial_executor, RayTrialExecutor): - _ray_auto_init() - - if _remote: - return ray.get( - ray.remote(num_cpus=0)(run).remote( - run_or_experiment, - name, - metric, - mode, - stop, - time_budget_s, - config, - resources_per_trial, - num_samples, - local_dir, - search_alg, - scheduler, - keep_checkpoints_num, - checkpoint_score_attr, - checkpoint_freq, - checkpoint_at_end, - verbose, - progress_reporter, - log_to_file, - trial_name_creator, - trial_dirname_creator, - sync_config, - export_formats, - max_failures, - fail_fast, - restore, - server_port, - resume, - queue_trials, - reuse_actors, - trial_executor, - raise_on_failed_trial, - callbacks, - # Deprecated args - loggers, - ray_auto_init, - run_errored_only, - global_checkpoint_period, - with_server, - upload_dir, - sync_to_cloud, - sync_to_driver, - sync_on_checkpoint, - _remote=False)) - all_start = time.time() if global_checkpoint_period: raise ValueError("global_checkpoint_period is deprecated. Set env var " @@ -572,8 +509,7 @@ def run_experiments( trial_executor: Optional[RayTrialExecutor] = None, raise_on_failed_trial: bool = True, concurrent: bool = True, - callbacks: Optional[Sequence[Callback]] = None, - _remote: bool = None): + callbacks: Optional[Sequence[Callback]] = None): """Runs and blocks until all trials finish. Examples: @@ -587,32 +523,6 @@ def run_experiments( List of Trial objects, holding data for each executed trial. 
""" - if _remote is None: - _remote = ray.util.client.ray.is_connected() - - if _remote is True and trial_executor: - raise ValueError("cannot use custom trial executor") - - if not trial_executor or isinstance(trial_executor, RayTrialExecutor): - _ray_auto_init() - - if _remote: - return ray.get( - ray.remote(num_cpus=0)(run_experiments).remote( - experiments, - scheduler, - server_port, - verbose, - progress_reporter, - resume, - queue_trials, - reuse_actors, - trial_executor, - raise_on_failed_trial, - concurrent, - callbacks, - _remote=False)) - # This is important to do this here # because it schematize the experiments # and it conducts the implicit registration. @@ -647,14 +557,3 @@ def run_experiments( scheduler=scheduler, callbacks=callbacks).trials return trials - - -def _ray_auto_init(): - """Initialize Ray unless already configured.""" - if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1": - logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.") - elif not ray.is_initialized(): - logger.info("Initializing Ray automatically." - "For cluster usage or custom Ray initialization, " - "call `ray.init(...)` before `tune.run`.") - ray.init() From 58bd4565bace015bd313e193e2c045e5e170f9f5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 106/244] Revert "Fix bug that otal_commands_queued_ is not initialized (#13852)" This reverts commit 45649bc1fad49b1341a7b1d56e3442a39e586220. 
--- src/ray/gcs/pubsub/gcs_pub_sub.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/gcs/pubsub/gcs_pub_sub.h b/src/ray/gcs/pubsub/gcs_pub_sub.h index b871a02b13dd..e5b3c1509265 100644 --- a/src/ray/gcs/pubsub/gcs_pub_sub.h +++ b/src/ray/gcs/pubsub/gcs_pub_sub.h @@ -45,7 +45,7 @@ class GcsPubSub { using Callback = std::function; explicit GcsPubSub(std::shared_ptr redis_client) - : redis_client_(redis_client), total_commands_queued_(0) {} + : redis_client_(redis_client) {} virtual ~GcsPubSub() = default; From 2fbbeea69c180e1e8990d80243c3c82e4d33bc0d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 107/244] Revert "remove starlette install instruction (#13869)" This reverts commit b6e5346005878c2184484a68c1c9b6681f7bf357. --- doc/source/serve/faq.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/serve/faq.rst b/doc/source/serve/faq.rst index 6faa5711266e..734293ec491c 100644 --- a/doc/source/serve/faq.rst +++ b/doc/source/serve/faq.rst @@ -95,6 +95,14 @@ and custom middlewares in Starlette format. The example below shows how to enabl `Cross-Origin Resource Sharing (CORS) `_. You can follow the same pattern for other Starlette middlewares. +.. note:: + + Serve does not list ``Starlette`` as one of its dependencies. To utilize this feature, + you will need to: + + .. code-block:: bash + + pip install starlette .. code-block:: python From 5654cda8456c9caa34bf0be9714581d3e874086b Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 108/244] Revert "[serve] Add example code for custom status code response (#13868)" This reverts commit 51e2c7ec99fbc6cf4fa86028599950cd6b0c81f8. 
--- doc/source/serve/faq.rst | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/doc/source/serve/faq.rst b/doc/source/serve/faq.rst index 734293ec491c..a9d66b610a60 100644 --- a/doc/source/serve/faq.rst +++ b/doc/source/serve/faq.rst @@ -73,20 +73,6 @@ To call a method via Python, use :mod:`handle.options `_ from your backend code: - -.. code-block:: python - - from starlette.responses import Response - - def f(starlette_request): - return Response('Hello, world!', status_code=123, media_type='text/plain') - - client.create_backend("hello", f) - How do I enable CORS and other HTTP features? --------------------------------------------- From 0e55f7dc8cfcf43b50bf444bf2fe9bcacbd176ee Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 109/244] Revert "[Hotfix] Lint (#13864)" This reverts commit 52609d7cd54e43e211927b1da9c7c73eff24a578. --- python/ray/serve/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/ray/serve/utils.py b/python/ray/serve/utils.py index 10753fcb5a2c..b4fdbf497e87 100644 --- a/python/ray/serve/utils.py +++ b/python/ray/serve/utils.py @@ -399,10 +399,7 @@ def __call__(self, batch): } for _ in range(len(batch))] async def other_method(self, batch): - responses = [] - for request in batch: - responses.append(await request.body()) - return responses + return [await request.body() for request in batch] def compute_iterable_delta(old: Iterable, From 63d17fc8e8cfafeaa08276ab7f01a9680f41f9a8 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 110/244] Revert "Add podman support (#13633)" This reverts commit f38283547b4188f320301c46ac3b77b05ea78d99. 
--- .../ray/autoscaler/_private/command_runner.py | 82 +++++++------------ python/ray/autoscaler/_private/docker.py | 22 +++-- python/ray/autoscaler/ray-schema.json | 5 -- python/ray/tests/test_autoscaler.py | 47 ----------- 4 files changed, 41 insertions(+), 115 deletions(-) diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index 2a3b7ae65a69..544e8b1077e4 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -584,9 +584,6 @@ def __init__(self, docker_config, **common_args): self.docker_config = docker_config self.home_dir = None self.initialized = False - # Optionally use 'podman' instead of 'docker' - use_podman = docker_config.get("use_podman", False) - self.docker_cmd = "podman" if use_podman else "docker" def run( self, @@ -601,8 +598,8 @@ def run( shutdown_after_run=False, ): if run_env == "auto": - run_env = "host" if (not bool(cmd) or cmd.find( - self.docker_cmd) == 0) else self.docker_cmd + run_env = "host" if (not bool(cmd) + or cmd.find("docker") == 0) else "docker" if environment_variables: cmd = _with_environment_variables(cmd, environment_variables) @@ -614,8 +611,7 @@ def run( cmd = with_docker_exec( [cmd], container_name=self.container_name, - with_interactive=is_using_login_shells(), - docker_cmd=self.docker_cmd)[0] + with_interactive=is_using_login_shells())[0] if shutdown_after_run: # sudo shutdown should run after `with_docker_exec` command above @@ -651,9 +647,9 @@ def run_rsync_up(self, source, target, options=None): # Without it, docker copies the source *into* the target host_destination += "/." 
self.ssh_command_runner.run( - "{} cp {} {}:{}".format(self.docker_cmd, host_destination, - self.container_name, - self._docker_expand_user(target)), + "docker cp {} {}:{}".format(host_destination, + self.container_name, + self._docker_expand_user(target)), silent=is_rsync_silent()) def run_rsync_down(self, source, target, options=None): @@ -672,9 +668,9 @@ def run_rsync_down(self, source, target, options=None): # Without it, docker copies the source *into* the target if not options.get("docker_mount_if_possible", False): self.ssh_command_runner.run( - "{} cp {}:{} {}".format(self.docker_cmd, self.container_name, - self._docker_expand_user(source), - host_source), + "docker cp {}:{} {}".format(self.container_name, + self._docker_expand_user(source), + host_source), silent=is_rsync_silent()) self.ssh_command_runner.run_rsync_down( host_source, target, options=options) @@ -682,30 +678,22 @@ def run_rsync_down(self, source, target, options=None): def remote_shell_command_str(self): inner_str = self.ssh_command_runner.remote_shell_command_str().replace( "ssh", "ssh -tt", 1).strip("\n") - return inner_str + " {} exec -it {} /bin/bash\n".format( - self.docker_cmd, self.container_name) + return inner_str + " docker exec -it {} /bin/bash\n".format( + self.container_name) def _check_docker_installed(self): no_exist = "NoExist" output = self.ssh_command_runner.run( - f"command -v {self.docker_cmd} || echo '{no_exist}'", - with_output=True) + f"command -v docker || echo '{no_exist}'", with_output=True) cleaned_output = output.decode().strip() if no_exist in cleaned_output or "docker" not in cleaned_output: - if self.docker_cmd == "docker": - install_commands = [ - "curl -fsSL https://get.docker.com -o get-docker.sh", - "sudo sh get-docker.sh", "sudo usermod -aG docker $USER", - "sudo systemctl restart docker -f" - ] - else: - install_commands = [ - "sudo apt-get update", "sudo apt-get -y install podman" - ] - + install_commands = [ + "curl -fsSL https://get.docker.com -o 
get-docker.sh", + "sudo sh get-docker.sh", "sudo usermod -aG docker $USER", + "sudo systemctl restart docker -f" + ] logger.error( - f"{self.docker_cmd.capitalize()} not installed. You can " - f"install {self.docker_cmd.capitalize()} by adding the " + "Docker not installed. You can install Docker by adding the " "following commands to 'initialization_commands':\n" + "\n".join(install_commands)) @@ -713,7 +701,7 @@ def _check_container_status(self): if self.initialized: return True output = self.ssh_command_runner.run( - check_docker_running_cmd(self.container_name, self.docker_cmd), + check_docker_running_cmd(self.container_name), with_output=True).decode("utf-8").strip() # Checks for the false positive where "true" is in the container name return ("true" in output.lower() @@ -724,8 +712,7 @@ def _docker_expand_user(self, string, any_char=False): if user_pos > -1: if self.home_dir is None: self.home_dir = self.ssh_command_runner.run( - f"{self.docker_cmd} exec {self.container_name} " - "printenv HOME", + f"docker exec {self.container_name} printenv HOME", with_output=True).decode("utf-8").strip() if any_char: @@ -740,7 +727,7 @@ def _check_if_container_restart_is_needed( self, image: str, cleaned_bind_mounts: Dict[str, str]) -> bool: re_init_required = False running_image = self.run( - check_docker_image(self.container_name, self.docker_cmd), + check_docker_image(self.container_name), with_output=True, run_env="host").decode("utf-8").strip() if running_image != image: @@ -749,7 +736,7 @@ def _check_if_container_restart_is_needed( "of {} (which was provided in the YAML)", self.container_name, running_image, image) mounts = self.run( - check_bind_mounts_cmd(self.container_name, self.docker_cmd), + check_bind_mounts_cmd(self.container_name), with_output=True, run_env="host").decode("utf-8").strip() try: @@ -791,14 +778,12 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): if self.docker_config.get("pull_before_run", True): assert specific_image, "Image must 
be included in config if " + \ "pull_before_run is specified" - self.run( - "{} pull {}".format(self.docker_cmd, specific_image), - run_env="host") + self.run("docker pull {}".format(specific_image), run_env="host") else: - self.run(f"{self.docker_cmd} image inspect {specific_image} " - "1> /dev/null 2>&1 || " - f"{self.docker_cmd} pull {specific_image}") + self.run( + f"docker image inspect {specific_image} 1> /dev/null 2>&1 || " + f"docker pull {specific_image}") # Bootstrap files cannot be bind mounted because docker opens the # underlying inode. When the file is switched, docker becomes outdated. @@ -814,15 +799,12 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): requires_re_init = self._check_if_container_restart_is_needed( specific_image, cleaned_bind_mounts) if requires_re_init: - self.run( - f"{self.docker_cmd} stop {self.container_name}", - run_env="host") + self.run(f"docker stop {self.container_name}", run_env="host") if (not container_running) or requires_re_init: # Get home directory image_env = self.ssh_command_runner.run( - f"{self.docker_cmd} " + "inspect -f '{{json .Config.Env}}' " + - specific_image, + "docker inspect -f '{{json .Config.Env}}' " + specific_image, with_output=True).decode().strip() home_directory = "/root" for env_var in json.loads(image_env): @@ -837,8 +819,7 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): "run_options", []) + self.docker_config.get( f"{'head' if as_head else 'worker'}_run_options", []) + self._configure_runtime() + self._auto_configure_shm(), - self.ssh_command_runner.cluster_name, home_directory, - self.docker_cmd) + self.ssh_command_runner.cluster_name, home_directory) self.run(start_command, run_env="host") docker_run_executed = True @@ -851,8 +832,7 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): # is called before the first `file_sync` happens self.run_rsync_up(file_mounts[mount], mount) self.ssh_command_runner.run( - "{cmd} cp {src} {container}:{dst}".format( - 
cmd=self.docker_cmd, + "docker cp {src} {container}:{dst}".format( src=os.path.join( self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), mount), @@ -866,7 +846,7 @@ def _configure_runtime(self): return [] runtime_output = self.ssh_command_runner.run( - f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ", + "docker info -f '{{.Runtimes}}' ", with_output=True).decode().strip() if "nvidia-container-runtime" in runtime_output: try: diff --git a/python/ray/autoscaler/_private/docker.py b/python/ray/autoscaler/_private/docker.py index 9a21cd9cbd36..46bb20a3feca 100644 --- a/python/ray/autoscaler/_private/docker.py +++ b/python/ray/autoscaler/_private/docker.py @@ -29,10 +29,8 @@ def validate_docker_config(config): def with_docker_exec(cmds, container_name, - docker_cmd, env_vars=None, with_interactive=False): - assert docker_cmd, "Must provide docker command" env_str = "" if env_vars: env_str = " ".join( @@ -47,27 +45,27 @@ def with_docker_exec(cmds, ] -def _check_helper(cname, template, docker_cmd): +def _check_helper(cname, template): return " ".join([ - docker_cmd, "inspect", "-f", "'{{" + template + "}}'", cname, "||", + "docker", "inspect", "-f", "'{{" + template + "}}'", cname, "||", "true" ]) -def check_docker_running_cmd(cname, docker_cmd): - return _check_helper(cname, ".State.Running", docker_cmd) +def check_docker_running_cmd(cname): + return _check_helper(cname, ".State.Running") -def check_bind_mounts_cmd(cname, docker_cmd): - return _check_helper(cname, "json .Mounts", docker_cmd) +def check_bind_mounts_cmd(cname): + return _check_helper(cname, "json .Mounts") -def check_docker_image(cname, docker_cmd): - return _check_helper(cname, ".Config.Image", docker_cmd) +def check_docker_image(cname): + return _check_helper(cname, ".Config.Image") def docker_start_cmds(user, image, mount_dict, container_name, user_options, - cluster_name, home_directory, docker_cmd): + cluster_name, home_directory): # Imported here due to circular dependency. 
from ray.autoscaler.sdk import get_docker_host_mount_location docker_mount_prefix = get_docker_host_mount_location(cluster_name) @@ -86,7 +84,7 @@ def docker_start_cmds(user, image, mount_dict, container_name, user_options, user_options_str = " ".join(user_options) docker_run = [ - docker_cmd, "run", "--rm", "--name {}".format(container_name), "-d", + "docker", "run", "--rm", "--name {}".format(container_name), "-d", "-it", mount_flags, env_flags, user_options_str, "--net=host", image, "bash" ] diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index df157bdc067c..7c7b2a1ed4ba 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -247,11 +247,6 @@ "type": "boolean", "description": "disable Ray from automatically detecting /dev/shm size for the container", "default": false - }, - "use_podman" : { - "type": "boolean", - "description": "Use 'podman' command in place of 'docker'", - "default": false } } }, diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index 204ed1ef8c9a..f0f16318ac37 100644 --- a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -429,53 +429,6 @@ def testGetOrCreateHeadNode(self): f"docker cp {docker_mount_prefix}/~/ray_bootstrap_config.yaml" runner.assert_has_call("1.2.3.4", pattern=pattern_to_assert) - @unittest.skipIf(sys.platform == "win32", "Failing on Windows.") - def testGetOrCreateHeadNodePodman(self): - config = copy.deepcopy(SMALL_CLUSTER) - config["docker"]["use_podman"] = True - config_path = self.write_config(config) - self.provider = MockProvider() - runner = MockProcessRunner() - runner.respond_to_call("json .Mounts", ["[]"]) - # Two initial calls to docker cp, + 2 more calls during run_init - runner.respond_to_call(".State.Running", - ["false", "false", "false", "false"]) - runner.respond_to_call("json .Config.Env", ["[]"]) - commands.get_or_create_head_node( - config, - 
printable_config_file=config_path, - no_restart=False, - restart_only=False, - yes=True, - override_cluster_name=None, - _provider=self.provider, - _runner=runner) - self.waitForNodes(1) - runner.assert_has_call("1.2.3.4", "init_cmd") - runner.assert_has_call("1.2.3.4", "head_setup_cmd") - runner.assert_has_call("1.2.3.4", "start_ray_head") - self.assertEqual(self.provider.mock_nodes[0].node_type, None) - runner.assert_has_call("1.2.3.4", pattern="podman run") - - docker_mount_prefix = get_docker_host_mount_location( - SMALL_CLUSTER["cluster_name"]) - runner.assert_not_has_call( - "1.2.3.4", - pattern=f"-v {docker_mount_prefix}/~/ray_bootstrap_config") - runner.assert_has_call( - "1.2.3.4", - pattern=f"podman cp {docker_mount_prefix}/~/ray_bootstrap_key.pem") - pattern_to_assert = \ - f"podman cp {docker_mount_prefix}/~/ray_bootstrap_config.yaml" - runner.assert_has_call("1.2.3.4", pattern=pattern_to_assert) - - for cmd in runner.command_history(): - assert "docker" not in cmd, ("Docker (not podman) found in call: " - f"{cmd}") - - runner.assert_has_call("1.2.3.4", "podman inspect") - runner.assert_has_call("1.2.3.4", "podman exec") - @unittest.skipIf(sys.platform == "win32", "Failing on Windows.") def testGetOrCreateHeadNodeFromStopped(self): self.testGetOrCreateHeadNode() From 86f8bf585e77fda55d763549d48493771cf086d5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 111/244] Revert "[RLlib] Unify fcnet initializers for the value output layer (std=1.0 in torch, but 0.01 in tf). (#13733)" This reverts commit b50d900b25b3be5488de193ccf9652bdfa5a4e88. 
--- rllib/models/torch/fcnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/models/torch/fcnet.py b/rllib/models/torch/fcnet.py index dc1608156a67..91b9c0e1d59d 100644 --- a/rllib/models/torch/fcnet.py +++ b/rllib/models/torch/fcnet.py @@ -109,7 +109,7 @@ def __init__(self, obs_space: gym.spaces.Space, self._value_branch = SlimFC( in_size=prev_layer_size, out_size=1, - initializer=normc_initializer(0.01), + initializer=normc_initializer(1.0), activation_fn=None) # Holds the current "base" output (before logits layer). self._features = None From b7012b39968d5fac5e417ece0075ce00073f7220 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 112/244] Revert "[RLlib] Trajectory view API example script (enhancements and tf2 support). (#13786)" This reverts commit e9b743daac53ac78080d35560be3b0312e81d76c. --- rllib/BUILD | 4 +- .../trajectory_view_utilizing_models.py | 67 ++++++------------- rllib/examples/trajectory_view_api.py | 8 +-- rllib/models/torch/misc.py | 5 +- rllib/policy/eager_tf_policy.py | 60 ++++------------- rllib/policy/torch_policy.py | 8 ++- 6 files changed, 46 insertions(+), 106 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index cfe22c60fbfd..9658983ab4a8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2114,7 +2114,7 @@ py_test( tags = ["examples", "examples_T"], size = "medium", srcs = ["examples/trajectory_view_api.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=100.0"] + args = ["--as-test", "--framework=tf", "--stop-reward=80.0"] ) py_test( @@ -2123,7 +2123,7 @@ py_test( tags = ["examples", "examples_T"], size = "medium", srcs = ["examples/trajectory_view_api.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=100.0"] + args = ["--as-test", "--framework=torch", "--stop-reward=80.0"] ) py_test( diff --git a/rllib/examples/models/trajectory_view_utilizing_models.py 
b/rllib/examples/models/trajectory_view_utilizing_models.py index 0fd4e22cb145..41f53d8724c4 100644 --- a/rllib/examples/models/trajectory_view_utilizing_models.py +++ b/rllib/examples/models/trajectory_view_utilizing_models.py @@ -3,8 +3,6 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.tf_ops import one_hot -from ray.rllib.utils.torch_ops import one_hot as torch_one_hot tf1, tf, tfv = try_import_tf() torch, nn = try_import_torch() @@ -30,42 +28,27 @@ def __init__(self, # Construct actual (very simple) FC model. assert len(obs_space.shape) == 1 - obs = tf.keras.layers.Input( + input_ = tf.keras.layers.Input( shape=(self.num_frames, obs_space.shape[0])) - obs_reshaped = tf.keras.layers.Reshape( - [obs_space.shape[0] * self.num_frames])(obs) - rewards = tf.keras.layers.Input(shape=(self.num_frames)) - rewards_reshaped = tf.keras.layers.Reshape([self.num_frames])(rewards) - actions = tf.keras.layers.Input( - shape=(self.num_frames, self.action_space.n)) - actions_reshaped = tf.keras.layers.Reshape( - [action_space.n * self.num_frames])(actions) - input_ = tf.keras.layers.Concatenate(axis=-1)( - [obs_reshaped, actions_reshaped, rewards_reshaped]) - layer1 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(input_) - layer2 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(layer1) - out = tf.keras.layers.Dense(self.num_outputs)(layer2) + reshaped = tf.keras.layers.Reshape( + [obs_space.shape[0] * self.num_frames])(input_) + layer1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)(reshaped) + out = tf.keras.layers.Dense(self.num_outputs)(layer1) values = tf.keras.layers.Dense(1)(layer1) - self.base_model = tf.keras.models.Model([obs, actions, rewards], - [out, values]) + self.base_model = tf.keras.models.Model([input_], [out, values]) + self._last_value = None self.view_requirements["prev_n_obs"] = 
ViewRequirement( data_col="obs", shift="-{}:0".format(num_frames - 1), space=obs_space) - self.view_requirements["prev_n_rewards"] = ViewRequirement( - data_col="rewards", shift="-{}:-1".format(self.num_frames)) - self.view_requirements["prev_n_actions"] = ViewRequirement( - data_col="actions", - shift="-{}:-1".format(self.num_frames), - space=self.action_space) + self.view_requirements["prev_rewards"] = ViewRequirement( + data_col="rewards", shift=-1) def forward(self, input_dict, states, seq_lens): - obs = tf.cast(input_dict["prev_n_obs"], tf.float32) - rewards = tf.cast(input_dict["prev_n_rewards"], tf.float32) - actions = one_hot(input_dict["prev_n_actions"], self.action_space) - out, self._last_value = self.base_model([obs, actions, rewards]) + obs = input_dict["prev_n_obs"] + out, self._last_value = self.base_model(obs) return out, [] def value_function(self): @@ -94,13 +77,13 @@ def __init__(self, # Construct actual (very simple) FC model. assert len(obs_space.shape) == 1 - in_size = self.num_frames * (obs_space.shape[0] + action_space.n + 1) self.layer1 = SlimFC( - in_size=in_size, out_size=256, activation_fn="relu") - self.layer2 = SlimFC(in_size=256, out_size=256, activation_fn="relu") + in_size=obs_space.shape[0] * self.num_frames, + out_size=64, + activation_fn="relu") self.out = SlimFC( - in_size=256, out_size=self.num_outputs, activation_fn="linear") - self.values = SlimFC(in_size=256, out_size=1, activation_fn="linear") + in_size=64, out_size=self.num_outputs, activation_fn="linear") + self.values = SlimFC(in_size=64, out_size=1, activation_fn="linear") self._last_value = None @@ -108,26 +91,14 @@ def __init__(self, data_col="obs", shift="-{}:0".format(num_frames - 1), space=obs_space) - self.view_requirements["prev_n_rewards"] = ViewRequirement( - data_col="rewards", shift="-{}:-1".format(self.num_frames)) - self.view_requirements["prev_n_actions"] = ViewRequirement( - data_col="actions", - shift="-{}:-1".format(self.num_frames), - 
space=self.action_space) + self.view_requirements["prev_rewards"] = ViewRequirement( + data_col="rewards", shift=-1) def forward(self, input_dict, states, seq_lens): obs = input_dict["prev_n_obs"] obs = torch.reshape(obs, [-1, self.obs_space.shape[0] * self.num_frames]) - rewards = torch.reshape(input_dict["prev_n_rewards"], - [-1, self.num_frames]) - actions = torch_one_hot(input_dict["prev_n_actions"], - self.action_space) - actions = torch.reshape(actions, - [-1, self.num_frames * actions.shape[-1]]) - input_ = torch.cat([obs, actions, rewards], dim=-1) - features = self.layer1(input_) - features = self.layer2(features) + features = self.layer1(obs) out = self.out(features) self._last_value = self.values(features) return out, [] diff --git a/rllib/examples/trajectory_view_api.py b/rllib/examples/trajectory_view_api.py index a720617793d2..400051ad506f 100644 --- a/rllib/examples/trajectory_view_api.py +++ b/rllib/examples/trajectory_view_api.py @@ -2,7 +2,6 @@ import ray from ray import tune -from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole from ray.rllib.examples.models.trajectory_view_utilizing_models import \ FrameStackingCartPoleModel, TorchFrameStackingCartPoleModel from ray.rllib.models.catalog import ModelCatalog @@ -17,7 +16,7 @@ "--framework", choices=["tf2", "tf", "tfe", "torch"], default="tf") parser.add_argument("--as-test", action="store_true") parser.add_argument("--stop-iters", type=int, default=50) -parser.add_argument("--stop-timesteps", type=int, default=200000) +parser.add_argument("--stop-timesteps", type=int, default=100000) parser.add_argument("--stop-reward", type=float, default=150.0) if __name__ == "__main__": @@ -27,14 +26,13 @@ ModelCatalog.register_custom_model( "frame_stack_model", FrameStackingCartPoleModel if args.framework != "torch" else TorchFrameStackingCartPoleModel) - tune.register_env("stateless_cartpole", lambda c: StatelessCartPole()) config = { - "env": "stateless_cartpole", + "env": "CartPole-v0", 
"model": { "custom_model": "frame_stack_model", "custom_model_config": { - "num_frames": 16, + "num_frames": 4, } }, "framework": args.framework, diff --git a/rllib/models/torch/misc.py b/rllib/models/torch/misc.py index 9f6d8234e87f..830e8bc33b5e 100644 --- a/rllib/models/torch/misc.py +++ b/rllib/models/torch/misc.py @@ -139,9 +139,8 @@ def __init__(self, layers = [] # Actual nn.Linear layer (including correct initialization logic). linear = nn.Linear(in_size, out_size, bias=use_bias) - if initializer is None: - initializer = nn.init.xavier_uniform_ - initializer(linear.weight) + if initializer: + initializer(linear.weight) if use_bias is True: nn.init.constant_(linear.bias, bias_init) layers.append(linear) diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index 1e1f42c05df2..805cacaaa4dc 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -5,7 +5,6 @@ import functools import logging import threading -from typing import Dict, List, Optional, Tuple from ray.util.debug import log_once from ray.rllib.models.catalog import ModelCatalog @@ -19,7 +18,6 @@ from ray.rllib.utils.tf_ops import convert_to_non_tf_type from ray.rllib.utils.threading import with_lock from ray.rllib.utils.tracking_dict import UsageTrackingDict -from ray.rllib.utils.typing import TensorType tf1, tf, tfv = try_import_tf() logger = logging.getLogger(__name__) @@ -363,7 +361,10 @@ def _compute_gradients_eager(self, samples): grads = [g for g, v in grads_and_vars] return grads, stats + @with_lock @override(Policy) + @convert_eager_inputs + @convert_eager_outputs def compute_actions(self, obs_batch, state_batches=None, @@ -375,9 +376,16 @@ def compute_actions(self, timestep=None, **kwargs): + explore = explore if explore is not None else \ + self.config["explore"] + timestep = timestep if timestep is not None else \ + self.global_timestep + + # TODO: remove python side effect to cull sources of bugs. 
self._is_training = False self._is_recurrent = \ state_batches is not None and state_batches != [] + self._state_in = state_batches or [] if not tf1.executing_eagerly(): tf1.enable_eager_execution() @@ -386,6 +394,8 @@ def compute_actions(self, SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_batch), "is_training": tf.constant(False), } + batch_size = input_dict[SampleBatch.CUR_OBS].shape[0] + seq_lens = tf.ones(batch_size, dtype=tf.int32) if obs_include_prev_action_reward: if prev_action_batch is not None: input_dict[SampleBatch.PREV_ACTIONS] = \ @@ -394,50 +404,6 @@ def compute_actions(self, input_dict[SampleBatch.PREV_REWARDS] = \ tf.convert_to_tensor(prev_reward_batch) - return self._compute_action_helper(input_dict, state_batches, - episodes, explore, timestep) - - @override(Policy) - def compute_actions_from_input_dict( - self, - input_dict: Dict[str, TensorType], - explore: bool = None, - timestep: Optional[int] = None, - **kwargs - ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: - - if not tf1.executing_eagerly(): - tf1.enable_eager_execution() - - # Pass lazy (torch) tensor dict to Model as `input_dict`. - input_dict = self._lazy_tensor_dict(input_dict) - # Pack internal state inputs into (separate) list. - state_batches = [ - input_dict[k] for k in input_dict.keys() if "state_in" in k[:8] - ] - - return self._compute_action_helper(input_dict, state_batches, None, - explore, timestep) - - @with_lock - @convert_eager_inputs - @convert_eager_outputs - def _compute_action_helper(self, input_dict, state_batches, episodes, - explore, timestep): - - explore = explore if explore is not None else \ - self.config["explore"] - timestep = timestep if timestep is not None else \ - self.global_timestep - if isinstance(timestep, tf.Tensor): - timestep = int(timestep.numpy()) - self._is_training = False - self._state_in = state_batches or [] - # Calculate RNN sequence lengths. 
- batch_size = input_dict[SampleBatch.CUR_OBS].shape[0] - seq_lens = tf.ones(batch_size, dtype=tf.int32) if state_batches \ - else None - # Use Exploration object. with tf.variable_creator_scope(_disallow_var_creation): if action_sampler_fn: @@ -530,6 +496,8 @@ def compute_log_likelihoods(self, input_dict[SampleBatch.CUR_OBS], explore=False, is_training=False) + action_dist = dist_class(dist_inputs, self.model) + log_likelihoods = action_dist.logp(actions) # Default log-likelihood calculation. else: dist_inputs, _ = self.model(input_dict, state_batches, diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index e492a5048563..19d576d3776a 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -159,6 +159,9 @@ def compute_actions( **kwargs) -> \ Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + explore = explore if explore is not None else self.config["explore"] + timestep = timestep if timestep is not None else self.global_timestep + with torch.no_grad(): seq_lens = torch.ones(len(obs_batch), dtype=torch.int32) input_dict = self._lazy_tensor_dict({ @@ -187,6 +190,9 @@ def compute_actions_from_input_dict( **kwargs) -> \ Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + explore = explore if explore is not None else self.config["explore"] + timestep = timestep if timestep is not None else self.global_timestep + with torch.no_grad(): # Pass lazy (torch) tensor dict to Model as `input_dict`. input_dict = self._lazy_tensor_dict(input_dict) @@ -210,8 +216,6 @@ def _compute_action_helper(self, input_dict, state_batches, seq_lens, Tuple: - actions, state_out, extra_fetches, logp. """ - explore = explore if explore is not None else self.config["explore"] - timestep = timestep if timestep is not None else self.global_timestep self._is_recurrent = state_batches is not None and state_batches != [] # Switch to eval mode. 
From 2470a593044a4cbeb01d900c42e20b636b2f8ebb Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 113/244] Revert "[serve] Support batches for ImportedBackends (#13843)" This reverts commit 82da023e64c9df89378b0e6a2d606165d73846c9. --- python/ray/serve/backends.py | 8 -------- python/ray/serve/tests/test_imported_backend.py | 2 +- python/ray/serve/utils.py | 13 +++++-------- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/python/ray/serve/backends.py b/python/ray/serve/backends.py index 5f58ad2c9a8d..086755500a46 100644 --- a/python/ray/serve/backends.py +++ b/python/ray/serve/backends.py @@ -1,4 +1,3 @@ -from ray import serve from ray.serve.utils import import_class @@ -27,13 +26,6 @@ def reconfigure(self, *args, **kwargs): # proxy it manually. return self.wrapped.reconfigure(*args, **kwargs) - # We mark 'accept_batch' here just so this will always pass the - # check we make during create_backend(). Unfortunately this means - # that validation won't happen until the replica is created. 
- @serve.accept_batch - def __call__(self, *args, **kwargs): - return self.wrapped(*args, **kwargs) - def __getattr__(self, attr): """Proxy all other methods to the wrapper class.""" return getattr(self.wrapped, attr) diff --git a/python/ray/serve/tests/test_imported_backend.py b/python/ray/serve/tests/test_imported_backend.py index 99f08a04ba07..cc575dd94e1d 100644 --- a/python/ray/serve/tests/test_imported_backend.py +++ b/python/ray/serve/tests/test_imported_backend.py @@ -7,7 +7,7 @@ def test_imported_backend(serve_instance): client = serve_instance backend_class = ImportedBackend("ray.serve.utils.MockImportedBackend") - config = BackendConfig(user_config="config", max_batch_size=2) + config = BackendConfig(user_config="config") client.create_backend( "imported", backend_class, "input_arg", config=config) client.create_endpoint("imported", backend="imported") diff --git a/python/ray/serve/utils.py b/python/ray/serve/utils.py index b4fdbf497e87..a594b94ddb90 100644 --- a/python/ray/serve/utils.py +++ b/python/ray/serve/utils.py @@ -392,14 +392,11 @@ def __init__(self, arg): def reconfigure(self, config): self.config = config - def __call__(self, batch): - return [{ - "arg": self.arg, - "config": self.config - } for _ in range(len(batch))] + def __call__(self, *args): + return {"arg": self.arg, "config": self.config} - async def other_method(self, batch): - return [await request.body() for request in batch] + async def other_method(self, request): + return await request.body() def compute_iterable_delta(old: Iterable, @@ -409,7 +406,7 @@ def compute_iterable_delta(old: Iterable, Usage: >>> old = {"a", "b"} >>> new = {"a", "d"} - >>> compute_iterable_delta(old, new) + >>> compute_dict_delta(old, new) ({"d"}, {"b"}, {"a"}) """ old_keys, new_keys = set(old), set(new) From 8ecd9bfcfa04c28a8e93d0e706d2760eaf785630 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 114/244] Revert 
"[tune] catch SIGINT signal and trigger experiment checkpoint (#13767)" This reverts commit a53f066f7ccd9706833ee8bc278eb0902e139d15. --- doc/source/tune/user-guide.rst | 59 -------------------- python/ray/tune/tests/test_tune_restore.py | 62 ---------------------- python/ray/tune/tune.py | 33 ++---------- 3 files changed, 3 insertions(+), 151 deletions(-) diff --git a/doc/source/tune/user-guide.rst b/doc/source/tune/user-guide.rst index 909ebbc9faf4..a830791d09fe 100644 --- a/doc/source/tune/user-guide.rst +++ b/doc/source/tune/user-guide.rst @@ -261,7 +261,6 @@ You can restore a single trial checkpoint by using ``tune.run(restore= ExperimentAnalysis: """Executes training. - When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run - will gracefully shut down and checkpoint the latest experiment state. - Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step. - Examples: .. code-block:: python @@ -271,6 +265,7 @@ def run( `LoggerCallback` and `SyncerCallback` callbacks are automatically added. + Returns: ExperimentAnalysis: Object for experiment analysis. @@ -432,24 +427,8 @@ def run( "`Trainable.default_resource_request` if using the " "Trainable API.") - original_handler = signal.getsignal(signal.SIGINT) - state = {signal.SIGINT: False} - - def sigint_handler(sig, frame): - logger.warning( - "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. " - "This will try to checkpoint the experiment state one last time. " - "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) " - "to skip. 
") - state[signal.SIGINT] = True - # Restore original signal handler to react to future SIGINT signals - signal.signal(signal.SIGINT, original_handler) - - if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")): - signal.signal(signal.SIGINT, sigint_handler) - tune_start = time.time() - while not runner.is_finished() and not state[signal.SIGINT]: + while not runner.is_finished(): runner.step() if has_verbosity(Verbosity.V1_EXPERIMENT): _report_progress(runner, progress_reporter) @@ -472,7 +451,7 @@ def sigint_handler(sig, frame): incomplete_trials += [trial] if incomplete_trials: - if raise_on_failed_trial and not state[signal.SIGINT]: + if raise_on_failed_trial: raise TuneError("Trials did not complete", incomplete_trials) else: logger.error("Trials did not complete: %s", incomplete_trials) @@ -482,12 +461,6 @@ def sigint_handler(sig, frame): logger.info(f"Total run time: {all_taken:.2f} seconds " f"({tune_taken:.2f} seconds for the tuning loop).") - if state[signal.SIGINT]: - logger.warning( - "Experiment has been interrupted, but the most recent state was " - "saved. You can continue running this experiment by passing " - "`resume=True` to `tune.run()`") - trials = runner.get_trials() return ExperimentAnalysis( runner.checkpoint_file, From 8060c9c737a50b973a9d1160f36b91929900c669 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 115/244] Revert "[RLlib] Issue #13761: Fix get action shape (#13764)" This reverts commit 5d529d7b2787ddf6f85cafde60e804c1eb85daa5. 
--- rllib/policy/policy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 1bce4b96d97e..577ac3d68c75 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -709,8 +709,7 @@ def _get_dummy_batch_from_view_requirements( ret = {} for view_col, view_req in self.view_requirements.items(): if isinstance(view_req.space, (gym.spaces.Dict, gym.spaces.Tuple)): - _, shape = ModelCatalog.get_action_shape( - view_req.space, framework=self.config["framework"]) + _, shape = ModelCatalog.get_action_shape(view_req.space) ret[view_col] = \ np.zeros((batch_size, ) + shape[1:], np.float32) else: From f86e249172a051e829991f88c1af71b2d4bf1e66 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 116/244] Revert "[RLlib] Trainer._validate_config idempotentcy correction (issue 13427) (#13556)" This reverts commit 9144db350d9471ef9ec6b0e408d046231b154ae5. 
--- rllib/BUILD | 7 ------- rllib/agents/trainer.py | 2 +- rllib/tests/test_trainer.py | 30 ------------------------------ 3 files changed, 1 insertion(+), 38 deletions(-) delete mode 100644 rllib/tests/test_trainer.py diff --git a/rllib/BUILD b/rllib/BUILD index 9658983ab4a8..dd1d4c1638a7 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1509,13 +1509,6 @@ py_test( srcs = ["tests/test_timesteps.py"] ) -py_test( - name = "tests/test_trainer", - tags = ["tests_dir", "tests_dir_T"], - size = "small", - srcs = ["tests/test_trainer.py"] -) - # -------------------------------------------------------------------- # examples/ directory # diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index 65e315a1d1e8..47e637f6dea7 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -1094,7 +1094,7 @@ def _validate_config(config: PartialTrainerConfigDict): if model_config.get("_time_major"): raise ValueError("`model._time_major` only supported " "iff `_use_trajectory_view_api` is True!") - elif traj_view_framestacks not in ["auto", 0]: + elif traj_view_framestacks != "auto": raise ValueError("`model.num_framestacks` only supported " "iff `_use_trajectory_view_api` is True!") model_config["num_framestacks"] = 0 diff --git a/rllib/tests/test_trainer.py b/rllib/tests/test_trainer.py deleted file mode 100644 index 7555c27c5581..000000000000 --- a/rllib/tests/test_trainer.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Testing for trainer class""" -import copy -import unittest -from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG - - -class TestTrainer(unittest.TestCase): - def test_validate_config_idempotent(self): - """ - Asserts that validate_config run multiple - times on COMMON_CONFIG will be idempotent - """ - # Given - standard_config = copy.deepcopy(COMMON_CONFIG) - standard_config["_use_trajectory_view_api"] = False - - # When (we validate config 2 times) - Trainer._validate_config(standard_config) - config_v1 = copy.deepcopy(standard_config) - 
Trainer._validate_config(standard_config) - config_v2 = copy.deepcopy(standard_config) - - # Then - self.assertEqual(config_v1, config_v2) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) From d12520c7bb47c0b6e85ef7c0e1586f2a6d32676f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 117/244] Revert "[RLlib] Update Documentation for Curiosity's support of continuous actions (#13784)" This reverts commit d73c04cc4171c805a2e7186d11565eb8a93543b3. --- doc/source/rllib-algorithms.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index b4f42c7ceab8..8b0413273597 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -51,7 +51,7 @@ Exploration-based plug-ins (can be combined with any algo) ============================= ========== ======================= ================== =========== ===================== Algorithm Frameworks Discrete Actions Continuous Actions Multi-Agent Model Support ============================= ========== ======================= ================== =========== ===================== -`Curiosity`_ tf + torch **Yes** `+parametric`_ No **Yes** `+RNN`_ +`Curiosity`_ tf + torch **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ ============================= ========== ======================= ================== =========== ===================== .. _`A2C, A3C`: rllib-algorithms.html#a3c From 0db8ec3fa5ca5216a03d3d3866c322d6d71b4c6a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 118/244] Revert "[RLlib] Allow SAC to use custom models as Q- or policy nets and deprecate "state-preprocessor" for image spaces. (#13522)" This reverts commit 10c31e8bf242b9a9ed3f5ca58866330234681376. 
--- doc/source/rllib-models.rst | 2 +- rllib/agents/sac/sac.py | 41 +--- rllib/agents/sac/sac_tf_model.py | 212 ++++++----------- rllib/agents/sac/sac_tf_policy.py | 53 +++-- rllib/agents/sac/sac_torch_model.py | 224 +++++++----------- rllib/agents/sac/tests/test_sac.py | 223 ++++++++--------- rllib/evaluation/rollout_worker.py | 15 +- .../models/cnn_plus_fc_concat_model.py | 218 +++++++++++++++++ rllib/models/catalog.py | 46 +--- rllib/models/modelv2.py | 40 ++-- rllib/models/tf/complex_input_net.py | 156 ------------ rllib/models/tf/fcnet.py | 8 +- rllib/models/tf/tf_modelv2.py | 7 +- rllib/models/tf/visionnet.py | 87 ++----- rllib/models/torch/complex_input_net.py | 163 ------------- rllib/models/torch/fcnet.py | 5 +- rllib/models/torch/visionnet.py | 84 ++----- .../tests/test_compute_log_likelihoods.py | 11 +- rllib/tests/run_regression_tests.py | 6 +- rllib/tests/test_nested_observation_spaces.py | 2 +- rllib/tests/test_supported_spaces.py | 4 +- rllib/tuned_examples/sac/atari-sac.yaml | 2 + rllib/tuned_examples/sac/mspacman-sac.yaml | 2 + rllib/utils/test_utils.py | 5 +- rllib/utils/threading.py | 2 +- 25 files changed, 609 insertions(+), 1009 deletions(-) create mode 100644 rllib/examples/models/cnn_plus_fc_concat_model.py delete mode 100644 rllib/models/tf/complex_input_net.py delete mode 100644 rllib/models/torch/complex_input_net.py diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 279256de45dc..59678af7e187 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -453,7 +453,7 @@ with the remaining non-image (flat) inputs (the 1D Box and discrete/one-hot comp Take a look at this model example that does exactly that: -.. literalinclude:: ../../rllib/models/tf/complex_input_net.py +.. 
literalinclude:: ../../rllib/examples/models/cnn_plus_fc_concat_model.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py index 97d0f7d77147..5c476248c737 100644 --- a/rllib/agents/sac/sac.py +++ b/rllib/agents/sac/sac.py @@ -16,7 +16,6 @@ from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer from ray.rllib.agents.sac.sac_tf_policy import SACTFPolicy from ray.rllib.policy.policy import Policy -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.typing import TrainerConfigDict logger = logging.getLogger(__name__) @@ -40,37 +39,16 @@ # Use a e.g. conv2D state preprocessing network before concatenating the # resulting (feature) vector with the action input for the input to # the Q-networks. - "use_state_preprocessor": DEPRECATED_VALUE, - # Model options for the Q network(s). These will override MODEL_DEFAULTS. - # The `Q_model` dict is treated just as the top-level `model` dict in - # setting up the Q-network(s) (2 if twin_q=True). - # That means, you can do for different observation spaces: - # obs=Box(1D) -> Tuple(Box(1D) + Action) -> concat -> post_fcnet - # obs=Box(3D) -> Tuple(Box(3D) + Action) -> vision-net -> concat w/ action - # -> post_fcnet - # obs=Tuple(Box(1D), Box(3D)) -> Tuple(Box(1D), Box(3D), Action) - # -> vision-net -> concat w/ Box(1D) and action -> post_fcnet - # You can also have SAC use your custom_model as Q-model(s), by simply - # specifying the `custom_model` sub-key in below dict (just like you would - # do in the top-level `model` dict. + "use_state_preprocessor": False, + # Model options for the Q network(s). "Q_model": { - "fcnet_hiddens": [256, 256], "fcnet_activation": "relu", - "post_fcnet_hiddens": [], - "post_fcnet_activation": None, - "custom_model": None, # Use this to define custom Q-model(s). 
- "custom_model_config": {}, + "fcnet_hiddens": [256, 256], }, - # Model options for the policy function (see `Q_model` above for details). - # The difference to `Q_model` above is that no action concat'ing is - # performed before the post_fcnet stack. + # Model options for the policy function. "policy_model": { - "fcnet_hiddens": [256, 256], "fcnet_activation": "relu", - "post_fcnet_hiddens": [], - "post_fcnet_activation": None, - "custom_model": None, # Use this to define a custom policy model. - "custom_model_config": {}, + "fcnet_hiddens": [256, 256], }, # Unsquash actions to the upper and lower bounds of env's action space. # Ignored for discrete action spaces. @@ -167,10 +145,11 @@ def validate_config(config: TrainerConfigDict) -> None: Raises: ValueError: In case something is wrong with the config. """ - if config["use_state_preprocessor"] != DEPRECATED_VALUE: - deprecation_warning( - old="config['use_state_preprocessor']", error=False) - config["use_state_preprocessor"] = DEPRECATED_VALUE + if config["model"].get("custom_model"): + logger.warning( + "Setting use_state_preprocessor=True since a custom model " + "was specified.") + config["use_state_preprocessor"] = True if config["grad_clip"] is not None and config["grad_clip"] <= 0.0: raise ValueError("`grad_clip` value must be > 0.0!") diff --git a/rllib/agents/sac/sac_tf_model.py b/rllib/agents/sac/sac_tf_model.py index e2c56b5215d2..4c890385f58f 100644 --- a/rllib/agents/sac/sac_tf_model.py +++ b/rllib/agents/sac/sac_tf_model.py @@ -1,12 +1,9 @@ import gym from gym.spaces import Box, Discrete import numpy as np -from typing import Dict, List, Optional +from typing import Optional, Tuple -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils import force_list -from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.spaces.simplex import Simplex from 
ray.rllib.utils.typing import ModelConfigDict, TensorType @@ -17,21 +14,14 @@ class SACTFModel(TFModelV2): """Extension of the standard TFModelV2 for SAC. - To customize, do one of the following: - - sub-class SACTFModel and override one or more of its methods. - - Use SAC's `Q_model` and `policy_model` keys to tweak the default model - behaviors (e.g. fcnet_hiddens, conv_filters, etc..). - - Use SAC's `Q_model->custom_model` and `policy_model->custom_model` keys - to specify your own custom Q-model(s) and policy-models, which will be - created within this SACTFModel (see `build_policy_model` and - `build_q_model`. - - Note: It is not recommended to override the `forward` method for SAC. This - would lead to shared weights (between policy and Q-nets), which will then - not be optimized by either of the critic- or actor-optimizers! + Instances of this Model get created via wrapping this class around another + default- or custom model (inside + rllib/agents/sac/sac_tf_policy.py::build_sac_model). Doing so simply adds + this class' methods (`get_q_values`, etc..) to the wrapped model, such that + the wrapped model can be used by the SAC algorithm. Data flow: - `obs` -> forward() (should stay a noop method!) -> `model_out` + `obs` -> forward() -> `model_out` `model_out` -> get_policy_output() -> pi(actions|obs) `model_out`, `actions` -> get_q_values() -> Q(s, a) `model_out`, `actions` -> get_twin_q_values() -> Q_twin(s, a) @@ -43,18 +33,20 @@ def __init__(self, num_outputs: Optional[int], model_config: ModelConfigDict, name: str, - policy_model_config: ModelConfigDict = None, - q_model_config: ModelConfigDict = None, + actor_hidden_activation: str = "relu", + actor_hiddens: Tuple[int] = (256, 256), + critic_hidden_activation: str = "relu", + critic_hiddens: Tuple[int] = (256, 256), twin_q: bool = False, initial_alpha: float = 1.0, target_entropy: Optional[float] = None): """Initialize a SACTFModel instance. 
Args: - policy_model_config (ModelConfigDict): The config dict for the - policy network. - q_model_config (ModelConfigDict): The config dict for the - Q-network(s) (2 if twin_q=True). + actor_hidden_activation (str): Activation for the actor network. + actor_hiddens (list): Hidden layers sizes for the actor network. + critic_hidden_activation (str): Activation for the critic network. + critic_hiddens (list): Hidden layers sizes for the critic network. twin_q (bool): Build twin Q networks (Q-net and target) for more stable Q-learning. initial_alpha (float): The initial value for the to-be-optimized @@ -85,15 +77,54 @@ def __init__(self, action_outs = self.action_dim q_outs = 1 - self.action_model = self.build_policy_model( - self.obs_space, action_outs, policy_model_config, "policy_model") + self.model_out = tf.keras.layers.Input( + shape=(self.num_outputs, ), name="model_out") + self.action_model = tf.keras.Sequential([ + tf.keras.layers.Dense( + units=hidden, + activation=getattr(tf.nn, actor_hidden_activation, None), + name="action_{}".format(i + 1)) + for i, hidden in enumerate(actor_hiddens) + ] + [ + tf.keras.layers.Dense( + units=action_outs, activation=None, name="action_out") + ]) + self.shift_and_log_scale_diag = self.action_model(self.model_out) + + self.actions_input = None + if not self.discrete: + self.actions_input = tf.keras.layers.Input( + shape=(self.action_dim, ), name="actions") + + def build_q_net(name, observations, actions): + # For continuous actions: Feed obs and actions (concatenated) + # through the NN. For discrete actions, only obs. 
+ q_net = tf.keras.Sequential(([ + tf.keras.layers.Concatenate(axis=1), + ] if not self.discrete else []) + [ + tf.keras.layers.Dense( + units=units, + activation=getattr(tf.nn, critic_hidden_activation, None), + name="{}_hidden_{}".format(name, i)) + for i, units in enumerate(critic_hiddens) + ] + [ + tf.keras.layers.Dense( + units=q_outs, activation=None, name="{}_out".format(name)) + ]) + + # TODO(hartikainen): Remove the unnecessary Model calls here + if self.discrete: + q_net = tf.keras.Model(observations, q_net(observations)) + else: + q_net = tf.keras.Model([observations, actions], + q_net([observations, actions])) + return q_net + + self.q_net = build_q_net("q", self.model_out, self.actions_input) - self.q_net = self.build_q_model(self.obs_space, self.action_space, - q_outs, q_model_config, "q") if twin_q: - self.twin_q_net = self.build_q_model(self.obs_space, - self.action_space, q_outs, - q_model_config, "twin_q") + self.twin_q_net = build_q_net("twin_q", self.model_out, + self.actions_input) else: self.twin_q_net = None @@ -112,80 +143,6 @@ def __init__(self, target_entropy = -np.prod(action_space.shape) self.target_entropy = target_entropy - @override(TFModelV2) - def forward(self, input_dict: Dict[str, TensorType], - state: List[TensorType], - seq_lens: TensorType) -> (TensorType, List[TensorType]): - """The common (Q-net and policy-net) forward pass. - - NOTE: It is not(!) recommended to override this method as it would - introduce a shared pre-network, which would be updated by both - actor- and critic optimizers. - """ - return input_dict["obs"], state - - def build_policy_model(self, obs_space, num_outputs, policy_model_config, - name): - """Builds the policy model used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own policy net. 
Alternatively, simply set `custom_model` within the - top level SAC `policy_model` config key to make this default - implementation of `build_policy_model` use your custom policy network. - - Returns: - TFModelV2: The TFModelV2 policy sub-model. - """ - model = ModelCatalog.get_model_v2( - obs_space, - self.action_space, - num_outputs, - policy_model_config, - framework="tf", - name=name) - return model - - def build_q_model(self, obs_space, action_space, num_outputs, - q_model_config, name): - """Builds one of the (twin) Q-nets used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own Q-nets. Alternatively, simply set `custom_model` within the - top level SAC `Q_model` config key to make this default implementation - of `build_q_model` use your custom Q-nets. - - Returns: - TFModelV2: The TFModelV2 Q-net sub-model. - """ - self.concat_obs_and_actions = False - if self.discrete: - input_space = obs_space - else: - orig_space = getattr(obs_space, "original_space", obs_space) - if isinstance(orig_space, Box) and len(orig_space.shape) == 1: - input_space = Box( - float("-inf"), - float("inf"), - shape=(orig_space.shape[0] + action_space.shape[0], )) - self.concat_obs_and_actions = True - else: - if isinstance(orig_space, gym.spaces.Tuple): - spaces = orig_space.spaces - elif isinstance(orig_space, gym.spaces.Dict): - spaces = list(orig_space.spaces.values()) - else: - spaces = [obs_space] - input_space = gym.spaces.Tuple(spaces + [action_space]) - - model = ModelCatalog.get_model_v2( - input_space, - action_space, - num_outputs, - q_model_config, - framework="tf", - name=name) - return model - def get_q_values(self, model_out: TensorType, actions: Optional[TensorType] = None) -> TensorType: @@ -204,7 +161,12 @@ def get_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.q_net) + # Continuous case -> concat actions to model_out. 
+ if actions is not None: + return self.q_net([model_out, actions]) + # Discrete case -> return q-vals for all actions. + else: + return self.q_net(model_out) def get_twin_q_values(self, model_out: TensorType, @@ -223,32 +185,12 @@ def get_twin_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.twin_q_net) - - def _get_q_value(self, model_out, actions, net): - # Model outs may come as original Tuple/Dict observations, concat them - # here if this is the case. - if isinstance(net.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = tf.concat(model_out, axis=-1) - elif isinstance(model_out, dict): - model_out = list(model_out.values()) - # Continuous case -> concat actions to model_out. if actions is not None: - if self.concat_obs_and_actions: - input_dict = {"obs": tf.concat([model_out, actions], axis=-1)} - else: - input_dict = {"obs": force_list(model_out) + [actions]} + return self.twin_q_net([model_out, actions]) # Discrete case -> return q-vals for all actions. else: - input_dict = {"obs": model_out} - # Switch on training mode (when getting Q-values, we are usually in - # training). - input_dict["is_training"] = True - - out, _ = net(input_dict, [], None) - return out + return self.twin_q_net(model_out) def get_policy_output(self, model_out: TensorType) -> TensorType: """Returns policy outputs, given the output of self.__call__(). @@ -265,23 +207,15 @@ def get_policy_output(self, model_out: TensorType) -> TensorType: Returns: TensorType: Distribution inputs for sampling actions. """ - # Model outs may come as original Tuple observations, concat them - # here if this is the case. 
- if isinstance(self.action_model.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = tf.concat(model_out, axis=-1) - elif isinstance(model_out, dict): - model_out = tf.concat(list(model_out.values()), axis=-1) - out, _ = self.action_model({"obs": model_out}, [], None) - return out + return self.action_model(model_out) def policy_variables(self): """Return the list of variables for the policy net.""" - return self.action_model.variables() + return list(self.action_model.variables) def q_variables(self): """Return the list of variables for Q / twin Q nets.""" - return self.q_net.variables() + (self.twin_q_net.variables() - if self.twin_q_net else []) + return self.q_net.variables + (self.twin_q_net.variables + if self.twin_q_net else []) diff --git a/rllib/agents/sac/sac_tf_policy.py b/rllib/agents/sac/sac_tf_policy.py index 83fa076ed292..44ddbff1fd84 100644 --- a/rllib/agents/sac/sac_tf_policy.py +++ b/rllib/agents/sac/sac_tf_policy.py @@ -6,7 +6,6 @@ from gym.spaces import Box, Discrete from functools import partial import logging -import numpy as np from typing import Dict, List, Optional, Tuple, Type, Union import ray @@ -18,7 +17,7 @@ from ray.rllib.agents.sac.sac_tf_model import SACTFModel from ray.rllib.agents.sac.sac_torch_model import SACTorchModel from ray.rllib.evaluation.episode import MultiAgentEpisode -from ray.rllib.models import ModelCatalog, MODEL_DEFAULTS +from ray.rllib.models import ModelCatalog from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_action_dist import Beta, Categorical, \ DiagGaussian, Dirichlet, SquashedGaussian, TFActionDistribution @@ -56,35 +55,40 @@ def build_sac_model(policy: Policy, obs_space: gym.spaces.Space, `policy.target_model`. """ # With separate state-preprocessor (before obs+action concat). - num_outputs = int(np.product(obs_space.shape)) + if config["use_state_preprocessor"]: + num_outputs = 256 # Flatten last Conv2D to this many nodes. 
+ # No separate state-preprocessor: concat obs+actions right away. + else: + num_outputs = 0 + # No state preprocessor: fcnet_hiddens should be empty. + if config["model"]["fcnet_hiddens"]: + logger.warning( + "When not using a state-preprocessor with SAC, `fcnet_hiddens`" + " will be set to an empty list! Any hidden layer sizes are " + "defined via `policy_model.fcnet_hiddens` and " + "`Q_model.fcnet_hiddens`.") + config["model"]["fcnet_hiddens"] = [] # Force-ignore any additionally provided hidden layer sizes. # Everything should be configured using SAC's "Q_model" and "policy_model" # settings. - policy_model_config = MODEL_DEFAULTS.copy() - policy_model_config.update(config["policy_model"]) - q_model_config = MODEL_DEFAULTS.copy() - q_model_config.update(config["Q_model"]) - - default_model_cls = SACTorchModel if config["framework"] == "torch" \ - else SACTFModel - model = ModelCatalog.get_model_v2( obs_space=obs_space, action_space=action_space, num_outputs=num_outputs, model_config=config["model"], framework=config["framework"], - default_model=default_model_cls, + model_interface=SACTorchModel + if config["framework"] == "torch" else SACTFModel, name="sac_model", - policy_model_config=policy_model_config, - q_model_config=q_model_config, + actor_hidden_activation=config["policy_model"]["fcnet_activation"], + actor_hiddens=config["policy_model"]["fcnet_hiddens"], + critic_hidden_activation=config["Q_model"]["fcnet_activation"], + critic_hiddens=config["Q_model"]["fcnet_hiddens"], twin_q=config["twin_q"], initial_alpha=config["initial_alpha"], target_entropy=config["target_entropy"]) - assert isinstance(model, default_model_cls) - # Create an exact copy of the model and store it in `policy.target_model`. 
# This will be used for tau-synched Q-target models that run behind the # actual Q-networks and are used for target q-value calculations in the @@ -95,16 +99,17 @@ def build_sac_model(policy: Policy, obs_space: gym.spaces.Space, num_outputs=num_outputs, model_config=config["model"], framework=config["framework"], - default_model=default_model_cls, + model_interface=SACTorchModel + if config["framework"] == "torch" else SACTFModel, name="target_sac_model", - policy_model_config=policy_model_config, - q_model_config=q_model_config, + actor_hidden_activation=config["policy_model"]["fcnet_activation"], + actor_hiddens=config["policy_model"]["fcnet_hiddens"], + critic_hidden_activation=config["Q_model"]["fcnet_activation"], + critic_hiddens=config["Q_model"]["fcnet_hiddens"], twin_q=config["twin_q"], initial_alpha=config["initial_alpha"], target_entropy=config["target_entropy"]) - assert isinstance(policy.target_model, default_model_cls) - return model @@ -193,14 +198,14 @@ def get_distribution_inputs_and_class( dist inputs, dist class, and a list of internal state outputs (in the RNN case). """ - # Get base-model (forward) output (this should be a noop call). - forward_out, state_out = model({ + # Get base-model output (w/o the SAC specific parts of the network). + model_out, state_out = model({ "obs": obs_batch, "is_training": policy._get_is_training_placeholder(), }, [], None) # Use the base output to get the policy outputs from the SAC model's # policy components. - distribution_inputs = model.get_policy_output(forward_out) + distribution_inputs = model.get_policy_output(model_out) # Get a distribution class to be used with the just calculated dist-inputs. 
action_dist_class = _get_dist_class(policy.config, policy.action_space) diff --git a/rllib/agents/sac/sac_torch_model.py b/rllib/agents/sac/sac_torch_model.py index f3fe34e23324..5f8b05980fed 100644 --- a/rllib/agents/sac/sac_torch_model.py +++ b/rllib/agents/sac/sac_torch_model.py @@ -1,12 +1,11 @@ import gym from gym.spaces import Box, Discrete import numpy as np -from typing import Dict, List, Optional +from typing import Optional, Tuple -from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils import force_list -from ray.rllib.utils.annotations import override +from ray.rllib.models.utils import get_activation_fn from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.typing import ModelConfigDict, TensorType @@ -17,21 +16,14 @@ class SACTorchModel(TorchModelV2, nn.Module): """Extension of the standard TorchModelV2 for SAC. - To customize, do one of the following: - - sub-class SACTorchModel and override one or more of its methods. - - Use SAC's `Q_model` and `policy_model` keys to tweak the default model - behaviors (e.g. fcnet_hiddens, conv_filters, etc..). - - Use SAC's `Q_model->custom_model` and `policy_model->custom_model` keys - to specify your own custom Q-model(s) and policy-models, which will be - created within this SACTFModel (see `build_policy_model` and - `build_q_model`. - - Note: It is not recommended to override the `forward` method for SAC. This - would lead to shared weights (between policy and Q-nets), which will then - not be optimized by either of the critic- or actor-optimizers! + Instances of this Model get created via wrapping this class around another + default- or custom model (inside + rllib/agents/sac/sac_torch_policy.py::build_sac_model). Doing so simply + adds this class' methods (`get_q_values`, etc..) 
to the wrapped model, such + that the wrapped model can be used by the SAC algorithm. Data flow: - `obs` -> forward() (should stay a noop method!) -> `model_out` + `obs` -> forward() -> `model_out` `model_out` -> get_policy_output() -> pi(actions|obs) `model_out`, `actions` -> get_q_values() -> Q(s, a) `model_out`, `actions` -> get_twin_q_values() -> Q_twin(s, a) @@ -43,18 +35,20 @@ def __init__(self, num_outputs: Optional[int], model_config: ModelConfigDict, name: str, - policy_model_config: ModelConfigDict = None, - q_model_config: ModelConfigDict = None, + actor_hidden_activation: str = "relu", + actor_hiddens: Tuple[int] = (256, 256), + critic_hidden_activation: str = "relu", + critic_hiddens: Tuple[int] = (256, 256), twin_q: bool = False, initial_alpha: float = 1.0, target_entropy: Optional[float] = None): """Initializes a SACTorchModel instance. 7 Args: - policy_model_config (ModelConfigDict): The config dict for the - policy network. - q_model_config (ModelConfigDict): The config dict for the - Q-network(s) (2 if twin_q=True). + actor_hidden_activation (str): Activation for the actor network. + actor_hiddens (list): Hidden layers sizes for the actor network. + critic_hidden_activation (str): Activation for the critic network. + critic_hiddens (list): Hidden layers sizes for the critic network. twin_q (bool): Build twin Q networks (Q-net and target) for more stable Q-learning. initial_alpha (float): The initial value for the to-be-optimized @@ -75,29 +69,74 @@ def __init__(self, self.action_dim = action_space.n self.discrete = True action_outs = q_outs = self.action_dim + action_ins = None # No action inputs for the discrete case. 
elif isinstance(action_space, Box): self.action_dim = np.product(action_space.shape) self.discrete = False action_outs = 2 * self.action_dim + action_ins = self.action_dim q_outs = 1 else: assert isinstance(action_space, Simplex) self.action_dim = np.product(action_space.shape) self.discrete = False action_outs = self.action_dim + action_ins = self.action_dim q_outs = 1 # Build the policy network. - self.action_model = self.build_policy_model( - self.obs_space, action_outs, policy_model_config, "policy_model") - - # Build the Q-network(s). - self.q_net = self.build_q_model(self.obs_space, self.action_space, - q_outs, q_model_config, "q") + self.action_model = nn.Sequential() + ins = self.num_outputs + self.obs_ins = ins + activation = get_activation_fn( + actor_hidden_activation, framework="torch") + for i, n in enumerate(actor_hiddens): + self.action_model.add_module( + "action_{}".format(i), + SlimFC( + ins, + n, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=activation)) + ins = n + self.action_model.add_module( + "action_out", + SlimFC( + ins, + action_outs, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=None)) + + # Build the Q-net(s), including target Q-net(s). + def build_q_net(name_): + activation = get_activation_fn( + critic_hidden_activation, framework="torch") + # For continuous actions: Feed obs and actions (concatenated) + # through the NN. For discrete actions, only obs. 
+ q_net = nn.Sequential() + ins = self.obs_ins + (0 if self.discrete else action_ins) + for i, n in enumerate(critic_hiddens): + q_net.add_module( + "{}_hidden_{}".format(name_, i), + SlimFC( + ins, + n, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=activation)) + ins = n + + q_net.add_module( + "{}_out".format(name_), + SlimFC( + ins, + q_outs, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=None)) + return q_net + + self.q_net = build_q_net("q") if twin_q: - self.twin_q_net = self.build_q_model(self.obs_space, - self.action_space, q_outs, - q_model_config, "twin_q") + self.twin_q_net = build_q_net("twin_q") else: self.twin_q_net = None @@ -118,80 +157,6 @@ def __init__(self, self.target_entropy = torch.tensor( data=[target_entropy], dtype=torch.float32, requires_grad=False) - @override(TorchModelV2) - def forward(self, input_dict: Dict[str, TensorType], - state: List[TensorType], - seq_lens: TensorType) -> (TensorType, List[TensorType]): - """The common (Q-net and policy-net) forward pass. - - NOTE: It is not(!) recommended to override this method as it would - introduce a shared pre-network, which would be updated by both - actor- and critic optimizers. - """ - return input_dict["obs"], state - - def build_policy_model(self, obs_space, num_outputs, policy_model_config, - name): - """Builds the policy model used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own policy net. Alternatively, simply set `custom_model` within the - top level SAC `policy_model` config key to make this default - implementation of `build_policy_model` use your custom policy network. - - Returns: - TorchModelV2: The TorchModelV2 policy sub-model. 
- """ - model = ModelCatalog.get_model_v2( - obs_space, - self.action_space, - num_outputs, - policy_model_config, - framework="torch", - name=name) - return model - - def build_q_model(self, obs_space, action_space, num_outputs, - q_model_config, name): - """Builds one of the (twin) Q-nets used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own Q-nets. Alternatively, simply set `custom_model` within the - top level SAC `Q_model` config key to make this default implementation - of `build_q_model` use your custom Q-nets. - - Returns: - TorchModelV2: The TorchModelV2 Q-net sub-model. - """ - self.concat_obs_and_actions = False - if self.discrete: - input_space = obs_space - else: - orig_space = getattr(obs_space, "original_space", obs_space) - if isinstance(orig_space, Box) and len(orig_space.shape) == 1: - input_space = Box( - float("-inf"), - float("inf"), - shape=(orig_space.shape[0] + action_space.shape[0], )) - self.concat_obs_and_actions = True - else: - if isinstance(orig_space, gym.spaces.Tuple): - spaces = orig_space.spaces - elif isinstance(orig_space, gym.spaces.Dict): - spaces = list(orig_space.spaces.values()) - else: - spaces = [obs_space] - input_space = gym.spaces.Tuple(spaces + [action_space]) - - model = ModelCatalog.get_model_v2( - input_space, - action_space, - num_outputs, - q_model_config, - framework="torch", - name=name) - return model - def get_q_values(self, model_out: TensorType, actions: Optional[TensorType] = None) -> TensorType: @@ -210,7 +175,12 @@ def get_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.q_net) + # Continuous case -> concat actions to model_out. + if actions is not None: + return self.q_net(torch.cat([model_out, actions], -1)) + # Discrete case -> return q-vals for all actions. 
+ else: + return self.q_net(model_out) def get_twin_q_values(self, model_out: TensorType, @@ -229,32 +199,12 @@ def get_twin_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.twin_q_net) - - def _get_q_value(self, model_out, actions, net): - # Model outs may come as original Tuple observations, concat them - # here if this is the case. - if isinstance(net.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = torch.cat(model_out, dim=-1) - elif isinstance(model_out, dict): - model_out = list(model_out.values()) - # Continuous case -> concat actions to model_out. if actions is not None: - if self.concat_obs_and_actions: - input_dict = {"obs": torch.cat([model_out, actions], dim=-1)} - else: - input_dict = {"obs": force_list(model_out) + [actions]} + return self.twin_q_net(torch.cat([model_out, actions], -1)) # Discrete case -> return q-vals for all actions. else: - input_dict = {"obs": model_out} - # Switch on training mode (when getting Q-values, we are usually in - # training). - input_dict["is_training"] = True - - out, _ = net(input_dict, [], None) - return out + return self.twin_q_net(model_out) def get_policy_output(self, model_out: TensorType) -> TensorType: """Returns policy outputs, given the output of self.__call__(). @@ -271,23 +221,15 @@ def get_policy_output(self, model_out: TensorType) -> TensorType: Returns: TensorType: Distribution inputs for sampling actions. """ - # Model outs may come as original Tuple observations, concat them - # here if this is the case. 
- if isinstance(self.action_model.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = torch.cat(model_out, dim=-1) - elif isinstance(model_out, dict): - model_out = torch.cat(list(model_out.values()), dim=-1) - out, _ = self.action_model({"obs": model_out}, [], None) - return out + return self.action_model(model_out) def policy_variables(self): """Return the list of variables for the policy net.""" - return self.action_model.variables() + return list(self.action_model.parameters()) def q_variables(self): """Return the list of variables for Q / twin Q nets.""" - return self.q_net.variables() + (self.twin_q_net.variables() - if self.twin_q_net else []) + return list(self.q_net.parameters()) + \ + (list(self.twin_q_net.parameters()) if self.twin_q_net else []) diff --git a/rllib/agents/sac/tests/test_sac.py b/rllib/agents/sac/tests/test_sac.py index 1ec87370982d..6a84b19c7478 100644 --- a/rllib/agents/sac/tests/test_sac.py +++ b/rllib/agents/sac/tests/test_sac.py @@ -1,5 +1,5 @@ from gym import Env -from gym.spaces import Box, Discrete, Tuple +from gym.spaces import Box import numpy as np import re import unittest @@ -9,10 +9,6 @@ from ray.rllib.agents.sac.sac_tf_policy import sac_actor_critic_loss as tf_loss from ray.rllib.agents.sac.sac_torch_policy import actor_critic_loss as \ loss_torch -from ray.rllib.examples.env.random_env import RandomEnv -from ray.rllib.examples.models.batch_norm_model import KerasBatchNormModel, \ - TorchBatchNormModel -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.tf.tf_action_dist import Dirichlet from ray.rllib.models.torch.torch_action_dist import TorchDirichlet from ray.rllib.execution.replay_buffer import LocalReplayBuffer @@ -56,7 +52,7 @@ def step(self, action): class TestSAC(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - ray.init(local_mode=True) + ray.init() @classmethod def tearDownClass(cls) -> None: @@ -65,46 +61,22 @@ def tearDownClass(cls) -> None: def 
test_sac_compilation(self): """Tests whether an SACTrainer can be built with all frameworks.""" config = sac.DEFAULT_CONFIG.copy() - config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy() config["num_workers"] = 0 # Run locally. config["twin_q"] = True + config["soft_horizon"] = True config["clip_actions"] = False config["normalize_actions"] = True config["learning_starts"] = 0 config["prioritized_replay"] = True - config["rollout_fragment_length"] = 10 - config["train_batch_size"] = 10 num_iterations = 1 - - ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel) - ModelCatalog.register_custom_model("batch_norm_torch", - TorchBatchNormModel) - - image_space = Box(-1.0, 1.0, shape=(84, 84, 3)) - simple_space = Box(-1.0, 1.0, shape=(3, )) - - for fw in framework_iterator(config): + for _ in framework_iterator(config): # Test for different env types (discrete w/ and w/o image, + cont). for env in [ - RandomEnv, - "MsPacmanNoFrameskip-v4", - "CartPole-v0", + "Pendulum-v0", "MsPacmanNoFrameskip-v4", "CartPole-v0" ]: print("Env={}".format(env)) - if env == RandomEnv: - config["env_config"] = { - "observation_space": Tuple( - [simple_space, - Discrete(2), image_space]), - "action_space": Box(-1.0, 1.0, shape=(1, )), - } - else: - config["env_config"] = {} - # Test making the Q-model a custom one for CartPole, otherwise, - # use the default model. - config["Q_model"]["custom_model"] = "batch_norm{}".format( - "_torch" - if fw == "torch" else "") if env == "CartPole-v0" else None + config["use_state_preprocessor"] = \ + env == "MsPacmanNoFrameskip-v4" trainer = sac.SACTrainer(config=config, env=env) for i in range(num_iterations): results = trainer.train() @@ -131,56 +103,49 @@ def test_sac_loss_function(self): config["env_config"] = {"simplex_actions": True} map_ = { - # Action net. - "default_policy/fc_1/kernel": "action_model._hidden_layers.0." + # Normal net. + "default_policy/sequential/action_1/kernel": "action_model." 
+ "action_0._model.0.weight", + "default_policy/sequential/action_1/bias": "action_model." + "action_0._model.0.bias", + "default_policy/sequential/action_out/kernel": "action_model." + "action_out._model.0.weight", + "default_policy/sequential/action_out/bias": "action_model." + "action_out._model.0.bias", + "default_policy/sequential_1/q_hidden_0/kernel": "q_net." + "q_hidden_0._model.0.weight", + "default_policy/sequential_1/q_hidden_0/bias": "q_net." + "q_hidden_0._model.0.bias", + "default_policy/sequential_1/q_out/kernel": "q_net." + "q_out._model.0.weight", + "default_policy/sequential_1/q_out/bias": "q_net." + "q_out._model.0.bias", + "default_policy/value_out/kernel": "_value_branch." "_model.0.weight", - "default_policy/fc_1/bias": "action_model._hidden_layers.0." + "default_policy/value_out/bias": "_value_branch." "_model.0.bias", - "default_policy/fc_out/kernel": "action_model." - "_logits._model.0.weight", - "default_policy/fc_out/bias": "action_model._logits._model.0.bias", - "default_policy/value_out/kernel": "action_model." - "_value_branch._model.0.weight", - "default_policy/value_out/bias": "action_model." - "_value_branch._model.0.bias", - # Q-net. - "default_policy/fc_1_1/kernel": "q_net." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_1/bias": "q_net." - "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_1/kernel": "q_net._logits._model.0.weight", - "default_policy/fc_out_1/bias": "q_net._logits._model.0.bias", - "default_policy/value_out_1/kernel": "q_net." - "_value_branch._model.0.weight", - "default_policy/value_out_1/bias": "q_net." - "_value_branch._model.0.bias", "default_policy/log_alpha": "log_alpha", - # Target action-net. - "default_policy/fc_1_2/kernel": "action_model." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_2/bias": "action_model." - "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_2/kernel": "action_model." 
- "_logits._model.0.weight", - "default_policy/fc_out_2/bias": "action_model." - "_logits._model.0.bias", - "default_policy/value_out_2/kernel": "action_model." - "_value_branch._model.0.weight", - "default_policy/value_out_2/bias": "action_model." - "_value_branch._model.0.bias", - # Target Q-net - "default_policy/fc_1_3/kernel": "q_net." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_3/bias": "q_net." - "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_3/kernel": "q_net." - "_logits._model.0.weight", - "default_policy/fc_out_3/bias": "q_net." - "_logits._model.0.bias", - "default_policy/value_out_3/kernel": "q_net." - "_value_branch._model.0.weight", - "default_policy/value_out_3/bias": "q_net." - "_value_branch._model.0.bias", + # Target net. + "default_policy/sequential_2/action_1/kernel": "action_model." + "action_0._model.0.weight", + "default_policy/sequential_2/action_1/bias": "action_model." + "action_0._model.0.bias", + "default_policy/sequential_2/action_out/kernel": "action_model." + "action_out._model.0.weight", + "default_policy/sequential_2/action_out/bias": "action_model." + "action_out._model.0.bias", + "default_policy/sequential_3/q_hidden_0/kernel": "q_net." + "q_hidden_0._model.0.weight", + "default_policy/sequential_3/q_hidden_0/bias": "q_net." + "q_hidden_0._model.0.bias", + "default_policy/sequential_3/q_out/kernel": "q_net." + "q_out._model.0.weight", + "default_policy/sequential_3/q_out/bias": "q_net." + "q_out._model.0.bias", + "default_policy/value_out_1/kernel": "_value_branch." + "_model.0.weight", + "default_policy/value_out_1/bias": "_value_branch." 
+ "_model.0.bias", "default_policy/log_alpha_1": "log_alpha", } @@ -260,12 +225,10 @@ def test_sac_loss_function(self): policy.td_error, policy.optimizer().compute_gradients( policy.critic_loss[0], - [v for v in policy.model.q_variables() if - "value_" not in v.name]), + policy.model.q_variables()), policy.optimizer().compute_gradients( policy.actor_loss, - [v for v in policy.model.policy_variables() if - "value_" not in v.name]), + policy.model.policy_variables()), policy.optimizer().compute_gradients( policy.alpha_loss, policy.model.log_alpha)], feed_dict=policy._get_loss_inputs_dict( @@ -298,6 +261,8 @@ def test_sac_loss_function(self): a.backward() # `actor_loss` depends on Q-net vars (but these grads must # be ignored and overridden in critic_loss.backward!). + assert not any(v.grad is None + for v in policy.model.q_variables()) assert not all( torch.mean(v.grad) == 0 for v in policy.model.policy_variables()) @@ -308,38 +273,45 @@ def test_sac_loss_function(self): # Compare with tf ones. torch_a_grads = [ v.grad for v in policy.model.policy_variables() - if v.grad is not None ] - check(tf_a_grads[2], - np.transpose(torch_a_grads[0].detach().cpu())) + for tf_g, torch_g in zip(tf_a_grads, torch_a_grads): + if tf_g.shape != torch_g.shape: + check(tf_g, np.transpose(torch_g.detach().cpu())) + else: + check(tf_g, torch_g) # Test critic gradients. 
policy.critic_optims[0].zero_grad() assert all( torch.mean(v.grad) == 0.0 - for v in policy.model.q_variables() if v.grad is not None) + for v in policy.model.q_variables()) assert all( torch.min(v.grad) == 0.0 - for v in policy.model.q_variables() if v.grad is not None) + for v in policy.model.q_variables()) assert policy.model.log_alpha.grad is None c[0].backward() assert not all( torch.mean(v.grad) == 0 - for v in policy.model.q_variables() if v.grad is not None) + for v in policy.model.q_variables()) assert not all( - torch.min(v.grad) == 0 for v in policy.model.q_variables() - if v.grad is not None) + torch.min(v.grad) == 0 for v in policy.model.q_variables()) assert policy.model.log_alpha.grad is None # Compare with tf ones. torch_c_grads = [v.grad for v in policy.model.q_variables()] - check(tf_c_grads[0], - np.transpose(torch_c_grads[2].detach().cpu())) + for tf_g, torch_g in zip(tf_c_grads, torch_c_grads): + if tf_g.shape != torch_g.shape: + check(tf_g, np.transpose(torch_g.detach().cpu())) + else: + check(tf_g, torch_g) # Compare (unchanged(!) actor grads) with tf ones. torch_a_grads = [ v.grad for v in policy.model.policy_variables() ] - check(tf_a_grads[2], - np.transpose(torch_a_grads[0].detach().cpu())) + for tf_g, torch_g in zip(tf_a_grads, torch_a_grads): + if tf_g.shape != torch_g.shape: + check(tf_g, np.transpose(torch_g.detach().cpu())) + else: + check(tf_g, torch_g) # Test alpha gradient. policy.alpha_optim.zero_grad() @@ -364,7 +336,7 @@ def test_sac_loss_function(self): prev_fw_loss = (c, a, e, t) # Update weights from our batch (n times). - for update_iteration in range(5): + for update_iteration in range(10): print("train iteration {}".format(update_iteration)) if fw == "tf": in_ = self._get_batch_helper(obs_size, actions, batch_size) @@ -378,9 +350,10 @@ def test_sac_loss_function(self): # Net must have changed. 
if tf_updated_weights: check( - updated_weights["default_policy/fc_1/kernel"], + updated_weights[ + "default_policy/sequential/action_1/kernel"], tf_updated_weights[-1][ - "default_policy/fc_1/kernel"], + "default_policy/sequential/action_1/kernel"], false=True) tf_updated_weights.append(updated_weights) @@ -394,9 +367,7 @@ def test_sac_loss_function(self): buf._fake_batch = in_ trainer.train() # Compare updated model. - for tf_key in sorted(tf_weights.keys()): - if re.search("_[23]|alpha", tf_key): - continue + for tf_key in sorted(tf_weights.keys())[2:10]: tf_var = tf_weights[tf_key] torch_var = policy.model.state_dict()[map_[tf_key]] if tf_var.shape != torch_var.shape: @@ -410,9 +381,7 @@ def test_sac_loss_function(self): check(policy.model.log_alpha, tf_weights["default_policy/log_alpha"]) # Compare target nets. - for tf_key in sorted(tf_weights.keys()): - if not re.search("_[23]", tf_key): - continue + for tf_key in sorted(tf_weights.keys())[10:18]: tf_var = tf_weights[tf_key] torch_var = policy.target_model.state_dict()[map_[ tf_key]] @@ -468,9 +437,9 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, fc( relu( fc(model_out_t, - weights[ks[1]], - weights[ks[0]], - framework=fw)), weights[ks[9]], weights[ks[8]]), None) + weights[ks[3]], + weights[ks[2]], + framework=fw)), weights[ks[5]], weights[ks[4]]), None) policy_t = action_dist_t.deterministic_sample() log_pis_t = action_dist_t.logp(policy_t) if sess: @@ -483,9 +452,9 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, fc( relu( fc(model_out_tp1, - weights[ks[1]], - weights[ks[0]], - framework=fw)), weights[ks[9]], weights[ks[8]]), None) + weights[ks[3]], + weights[ks[2]], + framework=fw)), weights[ks[5]], weights[ks[4]]), None) policy_tp1 = action_dist_tp1.deterministic_sample() log_pis_tp1 = action_dist_tp1.logp(policy_tp1) if sess: @@ -499,11 +468,11 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, relu( 
fc(np.concatenate( [model_out_t, train_batch[SampleBatch.ACTIONS]], -1), - weights[ks[3]], - weights[ks[2]], + weights[ks[7]], + weights[ks[6]], framework=fw)), - weights[ks[11]], - weights[ks[10]], + weights[ks[9]], + weights[ks[8]], framework=fw) # Q-values for current policy in given current state. @@ -511,11 +480,11 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, q_t_det_policy = fc( relu( fc(np.concatenate([model_out_t, policy_t], -1), - weights[ks[3]], - weights[ks[2]], + weights[ks[7]], + weights[ks[6]], framework=fw)), - weights[ks[11]], - weights[ks[10]], + weights[ks[9]], + weights[ks[8]], framework=fw) # Target q network evaluation. @@ -524,11 +493,11 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, q_tp1 = fc( relu( fc(np.concatenate([target_model_out_tp1, policy_tp1], -1), - weights[ks[7]], - weights[ks[6]], + weights[ks[15]], + weights[ks[14]], framework=fw)), - weights[ks[15]], - weights[ks[14]], + weights[ks[17]], + weights[ks[16]], framework=fw) else: assert fw == "tfe" @@ -569,9 +538,9 @@ def _translate_weights_to_torch(self, weights_dict, map_): map_[k]: convert_to_torch_tensor( np.transpose(v) if re.search("kernel", k) else np.array([v]) if re.search("log_alpha", k) else v) - for i, (k, v) in enumerate(weights_dict.items()) if i < 13 + for k, v in weights_dict.items() + if re.search("(sequential(/|_1)|value_out/|log_alpha)", k) } - return model_dict def _translate_tfe_weights(self, weights_dict, map_): diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 39d4bef776db..d0770cdf7dbb 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -32,7 +32,7 @@ from ray.rllib.utils import merge_dicts from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.debug import summarize -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.deprecation import 
deprecation_warning from ray.rllib.utils.filter import get_filter, Filter from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.sgd import do_minibatch_sgd @@ -396,22 +396,15 @@ def wrap(env): if clip_rewards is None: clip_rewards = True - # Deprecated way of framestacking is used. - framestack = model_config.get("framestack") is True # framestacking via trajectory view API is enabled. num_framestacks = model_config.get("num_framestacks", 0) - - # No trajectory view API: No traj. view based framestacking. if not policy_config["_use_trajectory_view_api"]: model_config["num_framestacks"] = num_framestacks = 0 - # Trajectory view API is on and num_framestacks=auto: Only - # stack traj. view based if old `framestack=[invalid value]`. elif num_framestacks == "auto": - if framestack == DEPRECATED_VALUE: - model_config["num_framestacks"] = num_framestacks = 4 - else: - model_config["num_framestacks"] = num_framestacks = 0 + model_config["num_framestacks"] = num_framestacks = 4 framestack_traj_view = num_framestacks > 1 + # Deprecated way of framestacking is used. 
+ framestack = model_config.get("framestack") is True def wrap(env): env = wrap_deepmind( diff --git a/rllib/examples/models/cnn_plus_fc_concat_model.py b/rllib/examples/models/cnn_plus_fc_concat_model.py new file mode 100644 index 000000000000..6f8e3d85e4e2 --- /dev/null +++ b/rllib/examples/models/cnn_plus_fc_concat_model.py @@ -0,0 +1,218 @@ +from gym.spaces import Discrete, Tuple + +from ray.rllib.examples.models.impala_vision_nets import TorchImpalaVisionNet +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.torch.misc import normc_initializer as \ + torch_normc_initializer, SlimFC +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.utils import get_filter_config +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +# __sphinx_doc_begin__ +class CNNPlusFCConcatModel(TFModelV2): + """TFModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). + + Note: This model should be used for complex (Dict or Tuple) observation + spaces that have one or more image components. + """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + # TODO: (sven) Support Dicts as well. + assert isinstance(obs_space.original_space, (Tuple)), \ + "`obs_space.original_space` must be Tuple!" + + super().__init__(obs_space, action_space, num_outputs, model_config, + name) + + # Build the CNN(s) given obs_space's image components. + self.cnns = {} + concat_size = 0 + for i, component in enumerate(obs_space.original_space): + # Image space. 
+ if len(component.shape) == 3: + config = { + "conv_filters": model_config.get( + "conv_filters", get_filter_config(component.shape)), + "conv_activation": model_config.get("conv_activation"), + } + cnn = ModelCatalog.get_model_v2( + component, + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="cnn_{}".format(i)) + concat_size += cnn.num_outputs + self.cnns[i] = cnn + # Discrete inputs -> One-hot encode. + elif isinstance(component, Discrete): + concat_size += component.n + # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). + # Everything else (1D Box). + else: + assert len(component.shape) == 1, \ + "Only input Box 1D or 3D spaces allowed!" + concat_size += component.shape[-1] + + self.logits_and_value_model = None + self._value_out = None + if num_outputs: + # Action-distribution head. + concat_layer = tf.keras.layers.Input((concat_size, )) + logits_layer = tf.keras.layers.Dense( + num_outputs, + activation=tf.keras.activations.linear, + name="logits")(concat_layer) + + # Create the value branch model. + value_layer = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01))(concat_layer) + self.logits_and_value_model = tf.keras.models.Model( + concat_layer, [logits_layer, value_layer]) + else: + self.num_outputs = concat_size + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + # Push image observations through our CNNs. + outs = [] + for i, component in enumerate(input_dict["obs"]): + if i in self.cnns: + cnn_out, _ = self.cnns[i]({"obs": component}) + outs.append(cnn_out) + else: + outs.append(component) + # Concat all outputs and the non-image inputs. + out = tf.concat(outs, axis=1) + if not self.logits_and_value_model: + return out, [] + + # Value branch. 
+ logits, values = self.logits_and_value_model(out) + self._value_out = tf.reshape(values, [-1]) + return logits, [] + + @override(ModelV2) + def value_function(self): + return self._value_out + + +# __sphinx_doc_end__ + + +class TorchCNNPlusFCConcatModel(TorchModelV2, nn.Module): + """TorchModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). + + Note: This model should be used for complex (Dict or Tuple) observation + spaces that have one or more image components. + """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + # TODO: (sven) Support Dicts as well. + assert isinstance(obs_space.original_space, (Tuple)), \ + "`obs_space.original_space` must be Tuple!" + + nn.Module.__init__(self) + TorchModelV2.__init__(self, obs_space, action_space, num_outputs, + model_config, name) + + # Atari type CNNs or IMPALA type CNNs (with residual layers)? + self.cnn_type = self.model_config["custom_model_config"].get( + "conv_type", "atari") + + # Build the CNN(s) given obs_space's image components. + self.cnns = {} + concat_size = 0 + for i, component in enumerate(obs_space.original_space): + # Image space. + if len(component.shape) == 3: + config = { + "conv_filters": model_config.get( + "conv_filters", get_filter_config(component.shape)), + "conv_activation": model_config.get("conv_activation"), + } + if self.cnn_type == "atari": + cnn = ModelCatalog.get_model_v2( + component, + action_space, + num_outputs=None, + model_config=config, + framework="torch", + name="cnn_{}".format(i)) + else: + cnn = TorchImpalaVisionNet( + component, + action_space, + num_outputs=None, + model_config=config, + name="cnn_{}".format(i)) + + concat_size += cnn.num_outputs + self.cnns[i] = cnn + self.add_module("cnn_{}".format(i), cnn) + # Discrete inputs -> One-hot encode. + elif isinstance(component, Discrete): + concat_size += component.n + # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). + # Everything else (1D Box). 
+ else: + assert len(component.shape) == 1, \ + "Only input Box 1D or 3D spaces allowed!" + concat_size += component.shape[-1] + + self.logits_layer = None + self.value_layer = None + self._value_out = None + + if num_outputs: + # Action-distribution head. + self.logits_layer = SlimFC( + in_size=concat_size, + out_size=num_outputs, + activation_fn=None, + ) + # Create the value branch model. + self.value_layer = SlimFC( + in_size=concat_size, + out_size=1, + activation_fn=None, + initializer=torch_normc_initializer(0.01)) + else: + self.num_outputs = concat_size + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + # Push image observations through our CNNs. + outs = [] + for i, component in enumerate(input_dict["obs"]): + if i in self.cnns: + cnn_out, _ = self.cnns[i]({"obs": component}) + outs.append(cnn_out) + else: + outs.append(component) + # Concat all outputs and the non-image inputs. + out = torch.cat(outs, dim=1) + if self.logits_layer is None: + return out, [] + + # Value branch. + logits, values = self.logits_layer(out), self.value_layer(out) + self._value_out = torch.reshape(values, [-1]) + return logits, [] + + @override(ModelV2) + def value_function(self): + return self._value_out diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 74ddcbeab2f5..66796d71f907 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -19,7 +19,7 @@ TorchDeterministic, TorchDiagGaussian, \ TorchMultiActionDistribution, TorchMultiCategorical from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI -from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.spaces.simplex import Simplex @@ -56,18 +56,6 @@ # "linear" (or None). 
"conv_activation": "relu", - # Some default models support a final FC stack of n Dense layers with given - # activation: - # - Complex observation spaces: Image components are fed through - # VisionNets, flat Boxes are left as-is, Discrete are one-hot'd, then - # everything is concated and pushed through this final FC stack. - # - VisionNets (CNNs), e.g. after the CNN stack, there may be - # additional Dense layers. - # - FullyConnectedNetworks will have this additional FCStack as well - # (that's why it's empty by default). - "post_fcnet_hiddens": [], - "post_fcnet_activation": "relu", - # For DiagGaussian action distributions, make the second half of the model # outputs floating bias variables instead of state-dependent. This only # has an effect is using the default fully connected net. @@ -700,22 +688,17 @@ def _get_v2_model_class(input_space: gym.Space, framework: str = "tf") -> Type[ModelV2]: VisionNet = None - ComplexNet = None if framework in ["tf2", "tf", "tfe"]: from ray.rllib.models.tf.fcnet import \ FullyConnectedNetwork as FCNet from ray.rllib.models.tf.visionnet import \ VisionNetwork as VisionNet - from ray.rllib.models.tf.complex_input_net import \ - ComplexInputNetwork as ComplexNet elif framework == "torch": from ray.rllib.models.torch.fcnet import (FullyConnectedNetwork as FCNet) from ray.rllib.models.torch.visionnet import (VisionNetwork as VisionNet) - from ray.rllib.models.torch.complex_input_net import \ - ComplexInputNetwork as ComplexNet elif framework == "jax": from ray.rllib.models.jax.fcnet import (FullyConnectedNetwork as FCNet) @@ -727,29 +710,16 @@ def _get_v2_model_class(input_space: gym.Space, # Discrete/1D obs-spaces or 2D obs space but traj. view framestacking # disabled. num_framestacks = model_config.get("num_framestacks", "auto") - - # Tuple space, where at least one sub-space is image. - # -> Complex input model. 
- space_to_check = input_space if not hasattr( - input_space, "original_space") else input_space.original_space - if isinstance(input_space, - Tuple) or (isinstance(space_to_check, Tuple) and any( - isinstance(s, Box) and len(s.shape) >= 2 - for s in space_to_check.spaces)): - return ComplexNet - - # Single, flattenable/one-hot-abe space -> Simple FCNet. if isinstance(input_space, (Discrete, MultiDiscrete)) or \ len(input_space.shape) == 1 or ( len(input_space.shape) == 2 and ( num_framestacks == "auto" or num_framestacks <= 1)): return FCNet - - elif framework == "jax": - raise NotImplementedError("No non-FC default net for JAX yet!") - - # Last resort: Conv2D stack for single image spaces. - return VisionNet + # Default Conv2D net. + else: + if framework == "jax": + raise NotImplementedError("No Conv2D default net for JAX yet!") + return VisionNet @staticmethod def _get_multi_action_distribution(dist_class, action_space, config, @@ -798,8 +768,8 @@ def _validate_config(config: ModelConfigDict, framework: str) -> None: "framework=jax so far!") if config.get("framestack") != DEPRECATED_VALUE: - # deprecation_warning( - # old="framestack", new="num_framestacks (int)", error=False) + deprecation_warning( + old="framestack", new="num_framestacks (int)", error=False) # If old behavior is desired, disable traj. view-style # framestacking. 
config["num_framestacks"] = 0 diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index bd5ee113219b..70ad50202421 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -203,13 +203,9 @@ def __call__( restored = input_dict.copy() restored["obs"] = restore_original_dimensions( input_dict["obs"], self.obs_space, self.framework) - try: - if len(input_dict["obs"].shape) > 2: - restored["obs_flat"] = flatten(input_dict["obs"], - self.framework) - else: - restored["obs_flat"] = input_dict["obs"] - except AttributeError: + if len(input_dict["obs"].shape) > 2: + restored["obs_flat"] = flatten(input_dict["obs"], self.framework) + else: restored["obs_flat"] = input_dict["obs"] with self.context(): res = self.forward(restored, state or [], seq_lens) @@ -220,6 +216,15 @@ def __call__( "got {}".format(res)) outputs, state = res + try: + shape = outputs.shape + except AttributeError: + raise ValueError("Output is not a tensor: {}".format(outputs)) + else: + if len(shape) != 2 or int(shape[1]) != self.num_outputs: + raise ValueError( + "Expected output shape of [None, {}], got {}".format( + self.num_outputs, shape)) if not isinstance(state, list): raise ValueError("State output is not a list: {}".format(state)) @@ -413,15 +418,15 @@ def restore_original_dimensions(obs: TensorType, observation space. """ - if tensorlib == "tf": - tensorlib = tf - elif tensorlib == "torch": - assert torch is not None - tensorlib = torch - original_space = getattr(obs_space, "original_space", obs_space) - if original_space is obs_space: + if hasattr(obs_space, "original_space"): + if tensorlib == "tf": + tensorlib = tf + elif tensorlib == "torch": + assert torch is not None + tensorlib = torch + return _unpack_obs(obs, obs_space.original_space, tensorlib=tensorlib) + else: return obs - return _unpack_obs(obs, original_space, tensorlib=tensorlib) # Cache of preprocessors, for if the user is calling unpack obs often. 
@@ -485,8 +490,7 @@ def _unpack_obs(obs: TensorType, space: gym.Space, tensorlib.reshape(obs_slice, batch_dims + list(p.shape)), v, tensorlib=tensorlib) - # Repeated space. - else: + elif isinstance(space, Repeated): assert isinstance(prep, RepeatedValuesPreprocessor), prep child_size = prep.child_preprocessor.size # The list lengths are stored in the first slot of the flat obs. @@ -499,6 +503,8 @@ def _unpack_obs(obs: TensorType, space: gym.Space, with_repeat_dim, space.child_space, tensorlib=tensorlib) return RepeatedValues( u, lengths=lengths, max_len=prep._obs_space.max_len) + else: + assert False, space return u else: return obs diff --git a/rllib/models/tf/complex_input_net.py b/rllib/models/tf/complex_input_net.py deleted file mode 100644 index 8bc691e2405e..000000000000 --- a/rllib/models/tf/complex_input_net.py +++ /dev/null @@ -1,156 +0,0 @@ -from gym.spaces import Box, Discrete, Tuple -import numpy as np - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2, restore_original_dimensions -from ray.rllib.models.tf.misc import normc_initializer -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.utils import get_filter_config -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_ops import one_hot - -tf1, tf, tfv = try_import_tf() - - -# __sphinx_doc_begin__ -class ComplexInputNetwork(TFModelV2): - """TFModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). - - Note: This model should be used for complex (Dict or Tuple) observation - spaces that have one or more image components. - - The data flow is as follows: - - `obs` (e.g. Tuple[img0, img1, discrete0]) -> `CNN0 + CNN1 + ONE-HOT` - `CNN0 + CNN1 + ONE-HOT` -> concat all flat outputs -> `out` - `out` -> (optional) FC-stack -> `out2` - `out2` -> action (logits) and vaulue heads. 
- """ - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - # TODO: (sven) Support Dicts as well. - self.original_space = obs_space.original_space if \ - hasattr(obs_space, "original_space") else obs_space - assert isinstance(self.original_space, (Tuple)), \ - "`obs_space.original_space` must be Tuple!" - - super().__init__(self.original_space, action_space, num_outputs, - model_config, name) - - # Build the CNN(s) given obs_space's image components. - self.cnns = {} - self.one_hot = {} - self.flatten = {} - concat_size = 0 - for i, component in enumerate(self.original_space): - # Image space. - if len(component.shape) == 3: - config = { - "conv_filters": model_config.get( - "conv_filters", get_filter_config(component.shape)), - "conv_activation": model_config.get("conv_activation"), - "post_fcnet_hiddens": [], - } - cnn = ModelCatalog.get_model_v2( - component, - action_space, - num_outputs=None, - model_config=config, - framework="tf", - name="cnn_{}".format(i)) - concat_size += cnn.num_outputs - self.cnns[i] = cnn - # Discrete inputs -> One-hot encode. - elif isinstance(component, Discrete): - self.one_hot[i] = True - concat_size += component.n - # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). - # Everything else (1D Box). - else: - self.flatten[i] = int(np.product(component.shape)) - concat_size += self.flatten[i] - - # Optional post-concat FC-stack. - post_fc_stack_config = { - "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), - "fcnet_activation": model_config.get("post_fcnet_activation", - "relu") - } - self.post_fc_stack = ModelCatalog.get_model_v2( - Box(float("-inf"), - float("inf"), - shape=(concat_size, ), - dtype=np.float32), - self.action_space, - None, - post_fc_stack_config, - framework="tf", - name="post_fc_stack") - - # Actions and value heads. - self.logits_and_value_model = None - self._value_out = None - if num_outputs: - # Action-distribution head. 
- concat_layer = tf.keras.layers.Input( - (self.post_fc_stack.num_outputs, )) - logits_layer = tf.keras.layers.Dense( - num_outputs, - activation=tf.keras.activations.linear, - name="logits")(concat_layer) - - # Create the value branch model. - value_layer = tf.keras.layers.Dense( - 1, - name="value_out", - activation=None, - kernel_initializer=normc_initializer(0.01))(concat_layer) - self.logits_and_value_model = tf.keras.models.Model( - concat_layer, [logits_layer, value_layer]) - else: - self.num_outputs = self.post_fc_stack.num_outputs - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - if SampleBatch.OBS in input_dict and "obs_flat" in input_dict: - orig_obs = input_dict[SampleBatch.OBS] - else: - orig_obs = restore_original_dimensions(input_dict[SampleBatch.OBS], - self.obs_space, "tf") - # Push image observations through our CNNs. - outs = [] - for i, component in enumerate(orig_obs): - if i in self.cnns: - cnn_out, _ = self.cnns[i]({SampleBatch.OBS: component}) - outs.append(cnn_out) - elif i in self.one_hot: - if component.dtype in [tf.int32, tf.int64, tf.uint8]: - outs.append( - one_hot(component, self.original_space.spaces[i])) - else: - outs.append(component) - else: - outs.append(tf.reshape(component, [-1, self.flatten[i]])) - # Concat all outputs and the non-image inputs. - out = tf.concat(outs, axis=1) - # Push through (optional) FC-stack (this may be an empty stack). - out, _ = self.post_fc_stack({SampleBatch.OBS: out}, [], None) - - # No logits/value branches. - if not self.logits_and_value_model: - return out, [] - - # Logits- and value branches. 
- logits, values = self.logits_and_value_model(out) - self._value_out = tf.reshape(values, [-1]) - return logits, [] - - @override(ModelV2) - def value_function(self): - return self._value_out - - -# __sphinx_doc_end__ diff --git a/rllib/models/tf/fcnet.py b/rllib/models/tf/fcnet.py index 9b0e8c565374..eea01014db9e 100644 --- a/rllib/models/tf/fcnet.py +++ b/rllib/models/tf/fcnet.py @@ -19,12 +19,8 @@ def __init__(self, obs_space: gym.spaces.Space, super(FullyConnectedNetwork, self).__init__( obs_space, action_space, num_outputs, model_config, name) - hiddens = model_config.get("fcnet_hiddens", []) + \ - model_config.get("post_fcnet_hiddens", []) - activation = model_config.get("fcnet_activation") - if not model_config.get("fcnet_hiddens", []): - activation = model_config.get("post_fcnet_activation") - activation = get_activation_fn(activation) + activation = get_activation_fn(model_config.get("fcnet_activation")) + hiddens = model_config.get("fcnet_hiddens", []) no_final_linear = model_config.get("no_final_linear") vf_share_layers = model_config.get("vf_share_layers") free_log_std = model_config.get("free_log_std") diff --git a/rllib/models/tf/tf_modelv2.py b/rllib/models/tf/tf_modelv2.py index dfb850a339f7..4394d321304a 100644 --- a/rllib/models/tf/tf_modelv2.py +++ b/rllib/models/tf/tf_modelv2.py @@ -107,8 +107,7 @@ def _find_sub_modules(current_key, struct): if isinstance(struct, tf.keras.models.Model): ret = {} for var in struct.variables: - name = re.sub("/", ".", var.name) - key = current_key + "." + name + key = current_key + "." + re.sub("/", ".", var.name) ret[key] = var return ret # Other TFModelV2: Include its vars into ours. @@ -119,7 +118,7 @@ def _find_sub_modules(current_key, struct): } # tf.Variable elif isinstance(struct, tf.Variable): - return {current_key: struct} + return {current_key + "." + struct.name: struct} # List/Tuple. 
elif isinstance(struct, (tuple, list)): ret = {} @@ -134,7 +133,7 @@ def _find_sub_modules(current_key, struct): current_key += "_" ret = {} for key, value in struct.items(): - sub_vars = TFModelV2._find_sub_modules(current_key + str(key), + sub_vars = TFModelV2._find_sub_modules(current_key + key, value) ret.update(sub_vars) return ret diff --git a/rllib/models/tf/visionnet.py b/rllib/models/tf/visionnet.py index 955ac1e52e7f..b83e867b6545 100644 --- a/rllib/models/tf/visionnet.py +++ b/rllib/models/tf/visionnet.py @@ -13,17 +13,7 @@ class VisionNetwork(TFModelV2): - """Generic vision network implemented in ModelV2 API. - - An additional post-conv fully connected stack can be added and configured - via the config keys: - `post_fcnet_hiddens`: Dense layer sizes after the Conv2D stack. - `post_fcnet_activation`: Activation function to use for this FC stack. - - Examples: - - - """ + """Generic vision network implemented in ModelV2 API.""" def __init__(self, obs_space: gym.spaces.Space, action_space: gym.spaces.Space, num_outputs: int, @@ -39,12 +29,6 @@ def __init__(self, obs_space: gym.spaces.Space, filters = self.model_config["conv_filters"] assert len(filters) > 0,\ "Must provide at least 1 entry in `conv_filters`!" - - # Post FC net config. - post_fcnet_hiddens = model_config.get("post_fcnet_hiddens", []) - post_fcnet_activation = get_activation_fn( - model_config.get("post_fcnet_activation"), framework="tf") - no_final_linear = self.model_config.get("no_final_linear") vf_share_layers = self.model_config.get("vf_share_layers") self.traj_view_framestacking = False @@ -78,29 +62,17 @@ def __init__(self, obs_space: gym.spaces.Space, out_size, kernel, stride = filters[-1] - # No final linear: Last layer has activation function and exits with - # num_outputs nodes (this could be a 1x1 conv or a FC layer, depending - # on `post_fcnet_...` settings). + # No final linear: Last layer is a Conv2D and uses num_outputs. 
if no_final_linear and num_outputs: last_layer = tf.keras.layers.Conv2D( - out_size if post_fcnet_hiddens else num_outputs, + num_outputs, kernel, strides=(stride, stride), activation=activation, padding="valid", data_format="channels_last", name="conv_out")(last_layer) - # Add (optional) post-fc-stack after last Conv2D layer. - layer_sizes = post_fcnet_hiddens[:-1] + ([num_outputs] - if post_fcnet_hiddens else - []) - for i, out_size in enumerate(layer_sizes): - last_layer = tf.keras.layers.Dense( - out_size, - name="post_fcnet_{}".format(i), - activation=post_fcnet_activation, - kernel_initializer=normc_initializer(1.0))(last_layer) - + conv_out = last_layer # Finish network normally (w/o overriding last layer size with # `num_outputs`), then add another linear one of size `num_outputs`. else: @@ -116,56 +88,29 @@ def __init__(self, obs_space: gym.spaces.Space, # num_outputs defined. Use that to create an exact # `num_output`-sized (1,1)-Conv2D. if num_outputs: - if post_fcnet_hiddens: - last_cnn = last_layer = tf.keras.layers.Conv2D( - post_fcnet_hiddens[0], [1, 1], - activation=post_fcnet_activation, - padding="same", - data_format="channels_last", - name="conv_out")(last_layer) - # Add (optional) post-fc-stack after last Conv2D layer. 
- for i, out_size in enumerate(post_fcnet_hiddens[1:] + - [num_outputs]): - last_layer = tf.keras.layers.Dense( - out_size, - name="post_fcnet_{}".format(i + 1), - activation=post_fcnet_activation - if i < len(post_fcnet_hiddens) - 1 else None, - kernel_initializer=normc_initializer(1.0))( - last_layer) - else: - last_cnn = last_layer = tf.keras.layers.Conv2D( - num_outputs, [1, 1], - activation=None, - padding="same", - data_format="channels_last", - name="conv_out")(last_layer) - - if last_cnn.shape[1] != 1 or last_cnn.shape[2] != 1: + conv_out = tf.keras.layers.Conv2D( + num_outputs, [1, 1], + activation=None, + padding="same", + data_format="channels_last", + name="conv_out")(last_layer) + + if conv_out.shape[1] != 1 or conv_out.shape[2] != 1: raise ValueError( "Given `conv_filters` ({}) do not result in a [B, 1, " "1, {} (`num_outputs`)] shape (but in {})! Please " "adjust your Conv2D stack such that the dims 1 and 2 " "are both 1.".format(self.model_config["conv_filters"], self.num_outputs, - list(last_cnn.shape))) + list(conv_out.shape))) # num_outputs not known -> Flatten, then set self.num_outputs # to the resulting number of nodes. else: self.last_layer_is_flattened = True - last_layer = tf.keras.layers.Flatten( + conv_out = tf.keras.layers.Flatten( data_format="channels_last")(last_layer) - - # Add (optional) post-fc-stack after last Conv2D layer. 
- for i, out_size in enumerate(post_fcnet_hiddens): - last_layer = tf.keras.layers.Dense( - out_size, - name="post_fcnet_{}".format(i), - activation=post_fcnet_activation, - kernel_initializer=normc_initializer(1.0))(last_layer) - self.num_outputs = last_layer.shape[1] - logits_out = last_layer + self.num_outputs = conv_out.shape[1] # Build the value layers if vf_share_layers: @@ -206,7 +151,7 @@ def __init__(self, obs_space: gym.spaces.Space, value_out = tf.keras.layers.Lambda( lambda x: tf.squeeze(x, axis=[1, 2]))(last_layer) - self.base_model = tf.keras.Model(inputs, [logits_out, value_out]) + self.base_model = tf.keras.Model(inputs, [conv_out, value_out]) # Optional: framestacking obs/new_obs for Atari. if self.traj_view_framestacking: diff --git a/rllib/models/torch/complex_input_net.py b/rllib/models/torch/complex_input_net.py deleted file mode 100644 index 2b9601947a5e..000000000000 --- a/rllib/models/torch/complex_input_net.py +++ /dev/null @@ -1,163 +0,0 @@ -from gym.spaces import Box, Discrete, Tuple -import numpy as np - -# TODO (sven): add IMPALA-style option. -# from ray.rllib.examples.models.impala_vision_nets import TorchImpalaVisionNet -from ray.rllib.models.torch.misc import normc_initializer as \ - torch_normc_initializer, SlimFC -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.models.utils import get_filter_config -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_ops import one_hot - -torch, nn = try_import_torch() - - -class ComplexInputNetwork(TorchModelV2, nn.Module): - """TorchModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). - - Note: This model should be used for complex (Dict or Tuple) observation - spaces that have one or more image components. - - The data flow is as follows: - - `obs` (e.g. 
Tuple[img0, img1, discrete0]) -> `CNN0 + CNN1 + ONE-HOT` - `CNN0 + CNN1 + ONE-HOT` -> concat all flat outputs -> `out` - `out` -> (optional) FC-stack -> `out2` - `out2` -> action (logits) and vaulue heads. - """ - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - # TODO: (sven) Support Dicts as well. - self.original_space = obs_space.original_space if \ - hasattr(obs_space, "original_space") else obs_space - assert isinstance(self.original_space, (Tuple)), \ - "`obs_space.original_space` must be Tuple!" - - nn.Module.__init__(self) - TorchModelV2.__init__(self, self.original_space, action_space, - num_outputs, model_config, name) - - # Atari type CNNs or IMPALA type CNNs (with residual layers)? - # self.cnn_type = self.model_config["custom_model_config"].get( - # "conv_type", "atari") - - # Build the CNN(s) given obs_space's image components. - self.cnns = {} - self.one_hot = {} - self.flatten = {} - concat_size = 0 - for i, component in enumerate(self.original_space): - # Image space. - if len(component.shape) == 3: - config = { - "conv_filters": model_config.get( - "conv_filters", get_filter_config(component.shape)), - "conv_activation": model_config.get("conv_activation"), - "post_fcnet_hiddens": [], - } - # if self.cnn_type == "atari": - cnn = ModelCatalog.get_model_v2( - component, - action_space, - num_outputs=None, - model_config=config, - framework="torch", - name="cnn_{}".format(i)) - # TODO (sven): add IMPALA-style option. - # else: - # cnn = TorchImpalaVisionNet( - # component, - # action_space, - # num_outputs=None, - # model_config=config, - # name="cnn_{}".format(i)) - - concat_size += cnn.num_outputs - self.cnns[i] = cnn - self.add_module("cnn_{}".format(i), cnn) - # Discrete inputs -> One-hot encode. - elif isinstance(component, Discrete): - self.one_hot[i] = True - concat_size += component.n - # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). - # Everything else (1D Box). 
- else: - self.flatten[i] = int(np.product(component.shape)) - concat_size += self.flatten[i] - - # Optional post-concat FC-stack. - post_fc_stack_config = { - "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), - "fcnet_activation": model_config.get("post_fcnet_activation", - "relu") - } - self.post_fc_stack = ModelCatalog.get_model_v2( - Box(float("-inf"), - float("inf"), - shape=(concat_size, ), - dtype=np.float32), - self.action_space, - None, - post_fc_stack_config, - framework="torch", - name="post_fc_stack") - - # Actions and value heads. - self.logits_layer = None - self.value_layer = None - self._value_out = None - - if num_outputs: - # Action-distribution head. - self.logits_layer = SlimFC( - in_size=self.post_fc_stack.num_outputs, - out_size=num_outputs, - activation_fn=None, - ) - # Create the value branch model. - self.value_layer = SlimFC( - in_size=self.post_fc_stack.num_outputs, - out_size=1, - activation_fn=None, - initializer=torch_normc_initializer(0.01)) - else: - self.num_outputs = concat_size - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - # Push image observations through our CNNs. - outs = [] - for i, component in enumerate(input_dict["obs"]): - if i in self.cnns: - cnn_out, _ = self.cnns[i]({"obs": component}) - outs.append(cnn_out) - elif i in self.one_hot: - if component.dtype in [torch.int32, torch.int64, torch.uint8]: - outs.append( - one_hot(component, self.original_space.spaces[i])) - else: - outs.append(component) - else: - outs.append(torch.reshape(component, [-1, self.flatten[i]])) - # Concat all outputs and the non-image inputs. - out = torch.cat(outs, dim=1) - # Push through (optional) FC-stack (this may be an empty stack). - out, _ = self.post_fc_stack({"obs": out}, [], None) - - # No logits/value branches. - if self.logits_layer is None: - return out, [] - - # Logits- and value branches. 
- logits, values = self.logits_layer(out), self.value_layer(out) - self._value_out = torch.reshape(values, [-1]) - return logits, [] - - @override(ModelV2) - def value_function(self): - return self._value_out diff --git a/rllib/models/torch/fcnet.py b/rllib/models/torch/fcnet.py index 91b9c0e1d59d..58fbb6bc476d 100644 --- a/rllib/models/torch/fcnet.py +++ b/rllib/models/torch/fcnet.py @@ -24,11 +24,8 @@ def __init__(self, obs_space: gym.spaces.Space, model_config, name) nn.Module.__init__(self) - hiddens = model_config.get("fcnet_hiddens", []) + \ - model_config.get("post_fcnet_hiddens", []) activation = model_config.get("fcnet_activation") - if not model_config.get("fcnet_hiddens", []): - activation = model_config.get("post_fcnet_activation") + hiddens = model_config.get("fcnet_hiddens", []) no_final_linear = model_config.get("no_final_linear") self.vf_share_layers = model_config.get("vf_share_layers") self.free_log_std = model_config.get("free_log_std") diff --git a/rllib/models/torch/visionnet.py b/rllib/models/torch/visionnet.py index 133c851f5b7a..cd6352acd532 100644 --- a/rllib/models/torch/visionnet.py +++ b/rllib/models/torch/visionnet.py @@ -5,7 +5,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.models.torch.misc import normc_initializer, same_padding, \ SlimConv2d, SlimFC -from ray.rllib.models.utils import get_activation_fn, get_filter_config +from ray.rllib.models.utils import get_filter_config from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.annotations import override @@ -33,12 +33,6 @@ def __init__(self, obs_space: gym.spaces.Space, filters = self.model_config["conv_filters"] assert len(filters) > 0,\ "Must provide at least 1 entry in `conv_filters`!" - - # Post FC net config. 
- post_fcnet_hiddens = model_config.get("post_fcnet_hiddens", []) - post_fcnet_activation = get_activation_fn( - model_config.get("post_fcnet_activation"), framework="torch") - no_final_linear = self.model_config.get("no_final_linear") vf_share_layers = self.model_config.get("vf_share_layers") @@ -74,33 +68,17 @@ def __init__(self, obs_space: gym.spaces.Space, out_channels, kernel, stride = filters[-1] - # No final linear: Last layer has activation function and exits with - # num_outputs nodes (this could be a 1x1 conv or a FC layer, depending - # on `post_fcnet_...` settings). + # No final linear: Last layer is a Conv2D and uses num_outputs. if no_final_linear and num_outputs: - out_channels = out_channels if post_fcnet_hiddens else num_outputs layers.append( SlimConv2d( in_channels, - out_channels, + num_outputs, kernel, stride, None, # padding=valid activation_fn=activation)) - - # Add (optional) post-fc-stack after last Conv2D layer. - layer_sizes = post_fcnet_hiddens[:-1] + ([num_outputs] - if post_fcnet_hiddens else - []) - for i, out_size in enumerate(layer_sizes): - layers.append( - SlimFC( - in_size=out_channels, - out_size=out_size, - activation_fn=post_fcnet_activation, - initializer=normc_initializer(1.0))) - out_channels = out_size - + out_channels = num_outputs # Finish network normally (w/o overriding last layer size with # `num_outputs`), then add another linear one of size `num_outputs`. else: @@ -121,31 +99,12 @@ def __init__(self, obs_space: gym.spaces.Space, np.ceil((in_size[1] - kernel[1]) / stride) ] padding, _ = same_padding(in_size, [1, 1], [1, 1]) - if post_fcnet_hiddens: - layers.append(nn.Flatten()) - in_size = out_channels - # Add (optional) post-fc-stack after last Conv2D layer. 
- for i, out_size in enumerate(post_fcnet_hiddens + - [num_outputs]): - layers.append( - SlimFC( - in_size=in_size, - out_size=out_size, - activation_fn=post_fcnet_activation - if i < len(post_fcnet_hiddens) - 1 else None, - initializer=normc_initializer(1.0))) - in_size = out_size - # Last layer is logits layer. - self._logits = layers.pop() - - else: - self._logits = SlimConv2d( - out_channels, - num_outputs, [1, 1], - 1, - padding, - activation_fn=None) - + self._logits = SlimConv2d( + out_channels, + num_outputs, [1, 1], + 1, + padding, + activation_fn=None) # num_outputs not known -> Flatten, then set self.num_outputs # to the resulting number of nodes. else: @@ -237,19 +196,16 @@ def forward(self, input_dict: Dict[str, TensorType], if not self.last_layer_is_flattened: if self._logits: conv_out = self._logits(conv_out) - if len(conv_out.shape) == 4: - if conv_out.shape[2] != 1 or conv_out.shape[3] != 1: - raise ValueError( - "Given `conv_filters` ({}) do not result in a [B, {} " - "(`num_outputs`), 1, 1] shape (but in {})! Please " - "adjust your Conv2D stack such that the last 2 dims " - "are both 1.".format(self.model_config["conv_filters"], - self.num_outputs, - list(conv_out.shape))) - logits = conv_out.squeeze(3) - logits = logits.squeeze(2) - else: - logits = conv_out + if conv_out.shape[2] != 1 or conv_out.shape[3] != 1: + raise ValueError( + "Given `conv_filters` ({}) do not result in a [B, {} " + "(`num_outputs`), 1, 1] shape (but in {})! 
Please adjust " + "your Conv2D stack such that the last 2 dims are both " + "1.".format(self.model_config["conv_filters"], + self.num_outputs, list(conv_out.shape))) + logits = conv_out.squeeze(3) + logits = logits.squeeze(2) + return logits, state else: return conv_out, state diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index 77c52d44b5d8..b64eabd47cea 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -177,8 +177,8 @@ def logp_func(means, log_stds, values, low=-1.0, high=1.0): config, prev_a, continuous=True, - layer_key=("fc", (0, 2), ("action_model._hidden_layers.0.", - "action_model._logits.")), + layer_key=("sequential/action", (2, 4), + ("action_model.action_0.", "action_model.action_out.")), logp_func=logp_func) def test_sac_discr(self): @@ -188,7 +188,12 @@ def test_sac_discr(self): config["policy_model"]["fcnet_activation"] = "linear" prev_a = np.array(0) - do_test_log_likelihood(sac.SACTrainer, config, prev_a) + do_test_log_likelihood( + sac.SACTrainer, + config, + prev_a, + layer_key=("sequential/action", (0, 2), + ("action_model.action_0.", "action_model.action_out."))) if __name__ == "__main__": diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index cc2650425fb9..3f42147e4071 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -37,10 +37,6 @@ "--yaml-dir", type=str, help="The directory in which to find all yamls to test.") -parser.add_argument( - "--local-mode", - action="store_true", - help="Run ray in local mode for easier debugging.") # Obsoleted arg, use --framework=torch instead. 
parser.add_argument( @@ -96,7 +92,7 @@ passed = False for i in range(3): try: - ray.init(num_cpus=5, local_mode=args.local_mode) + ray.init(num_cpus=5) trials = run_experiments(experiments, resume=False, verbose=2) finally: ray.shutdown() diff --git a/rllib/tests/test_nested_observation_spaces.py b/rllib/tests/test_nested_observation_spaces.py index e1aac7b42cb3..1a10e8c71d0e 100644 --- a/rllib/tests/test_nested_observation_spaces.py +++ b/rllib/tests/test_nested_observation_spaces.py @@ -333,7 +333,7 @@ def test_invalid_model(self): def test_invalid_model2(self): ModelCatalog.register_custom_model("invalid2", InvalidModel2) self.assertRaisesRegexp( - ValueError, "State output is not a list", + ValueError, "Expected output shape of", lambda: PGTrainer( env="CartPole-v0", config={ "model": { diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 40bba43b2cb8..39a7ebb9382f 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -15,7 +15,7 @@ ACTION_SPACES_TO_TEST = { "discrete": Discrete(5), "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32), - "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32), + # "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32), "multidiscrete": MultiDiscrete([1, 2, 3, 4]), "tuple": Tuple( [Discrete(2), @@ -63,6 +63,8 @@ def _do_check(alg, config, a_name, o_name): p_done=1.0, check_action_bounds=check_bounds))) stat = "ok" + if alg == "SAC": + config["use_state_preprocessor"] = o_name in ["atari", "image"] try: a = get_agent_class(alg)(config=config, env=RandomEnv) diff --git a/rllib/tuned_examples/sac/atari-sac.yaml b/rllib/tuned_examples/sac/atari-sac.yaml index 4efca862011d..28c6d26db6a1 100644 --- a/rllib/tuned_examples/sac/atari-sac.yaml +++ b/rllib/tuned_examples/sac/atari-sac.yaml @@ -14,6 +14,8 @@ atari-sac-tf-and-torch: framework: grid_search: [tf, torch] gamma: 0.99 + # state-preprocessor=Our default Atari Conv2D-net. 
+ use_state_preprocessor: true Q_model: hidden_activation: relu hidden_layer_sizes: [512] diff --git a/rllib/tuned_examples/sac/mspacman-sac.yaml b/rllib/tuned_examples/sac/mspacman-sac.yaml index 9d563884bf2d..50883b114ecb 100644 --- a/rllib/tuned_examples/sac/mspacman-sac.yaml +++ b/rllib/tuned_examples/sac/mspacman-sac.yaml @@ -11,6 +11,8 @@ mspacman-sac-tf: # Works for both torch and tf. framework: tf gamma: 0.99 + # state-preprocessor=Our default Atari Conv2D-net. + use_state_preprocessor: true Q_model: fcnet_hiddens: [512] fcnet_activation: relu diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 89a402117b4c..eda9d1cfa11a 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -301,10 +301,13 @@ def check_compute_single_action(trainer, assert worker_set if isinstance(worker_set, list): obs_space = trainer.get_policy().observation_space + try: + obs_space = obs_space.original_space + except AttributeError: + pass else: obs_space = worker_set.local_worker().for_policy( lambda p: p.observation_space) - obs_space = getattr(obs_space, "original_space", obs_space) else: method_to_test = pol.compute_single_action obs_space = pol.observation_space diff --git a/rllib/utils/threading.py b/rllib/utils/threading.py index adc7dfe10f40..7361dad65383 100644 --- a/rllib/utils/threading.py +++ b/rllib/utils/threading.py @@ -22,6 +22,6 @@ def wrapper(self, *a, **k): except AttributeError: raise AttributeError( "Object {} must have a `self._lock` property (assigned to a " - "threading.RLock() object in its constructor)!".format(self)) + "threading.Lock() object in its constructor)!".format(self)) return wrapper From aef07914c3e0507f6c25c02e0c384daccabec609 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 119/244] Revert "Add Ray client protocol version (#13846)" This reverts commit 4c6213353c68126c7a27761ac067e28af776d7a1. 
--- python/ray/tests/test_client_init.py | 1 - python/ray/util/client/server/dataservicer.py | 7 +------ python/ray/util/client/worker.py | 1 - src/ray/protobuf/ray_client.proto | 2 -- 4 files changed, 1 insertion(+), 10 deletions(-) diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py index 5e43ac6314b7..0c54f93eafa9 100644 --- a/python/ray/tests/test_client_init.py +++ b/python/ray/tests/test_client_init.py @@ -36,7 +36,6 @@ def test_num_clients(): assert isinstance(info3["ray_version"], str), info3 assert isinstance(info3["ray_commit"], str), info3 assert isinstance(info3["python_version"], str), info3 - assert isinstance(info3["protocol_version"], str), info3 api3.disconnect() finally: ray_client_server.shutdown_with_server(server) diff --git a/python/ray/util/client/server/dataservicer.py b/python/ray/util/client/server/dataservicer.py index 7091478208f3..a01369e43662 100644 --- a/python/ray/util/client/server/dataservicer.py +++ b/python/ray/util/client/server/dataservicer.py @@ -14,10 +14,6 @@ logger = logging.getLogger(__name__) -# This version string is incremented to indicate breaking changes in the -# protocol that require upgrading the client version. 
-CURRENT_PROTOCOL_VERSION = "2020-02-01" - class DataServicer(ray_client_pb2_grpc.RayletDataStreamerServicer): def __init__(self, basic_service: "RayletServicer"): @@ -77,5 +73,4 @@ def _build_connection_response(self): python_version="{}.{}.{}".format( sys.version_info[0], sys.version_info[1], sys.version_info[2]), ray_version=ray.__version__, - ray_commit=ray.__commit__, - protocol_version=CURRENT_PROTOCOL_VERSION) + ray_commit=ray.__commit__) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index 535ec5ab76b4..a97ccaca7798 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -139,7 +139,6 @@ def connection_info(self): "python_version": data.python_version, "ray_version": data.ray_version, "ray_commit": data.ray_commit, - "protocol_version": data.protocol_version, } def get(self, vals, *, timeout: Optional[float] = None) -> Any: diff --git a/src/ray/protobuf/ray_client.proto b/src/ray/protobuf/ray_client.proto index 6781f1935246..1ba8675017d8 100644 --- a/src/ray/protobuf/ray_client.proto +++ b/src/ray/protobuf/ray_client.proto @@ -266,8 +266,6 @@ message ConnectionInfoResponse { string ray_commit = 3; // The Python version (e.g., "3.7.2"). string python_version = 4; - // The protocol version of the server (e.g., "2020-02-01"). - string protocol_version = 5; } message DataRequest { From ded5c3f4b0dfd348304c51a04c8319e62dc2f1a9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 120/244] Revert "Revert "Revert "Enable Ray client server by default (#13350)" (#13429)" (#13442)" This reverts commit 41d4d56c219c264b2386521319289def33309b59. 
--- cpp/src/ray/util/process_helper.cc | 2 +- doc/source/ray-client.rst | 37 ++++++++++++++++++++++-------- python/ray/scripts/scripts.py | 2 +- python/ray/tests/test_job.py | 4 ++-- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/cpp/src/ray/util/process_helper.cc b/cpp/src/ray/util/process_helper.cc index 3ee6a2c34d8e..7227337edf4d 100644 --- a/cpp/src/ray/util/process_helper.cc +++ b/cpp/src/ray/util/process_helper.cc @@ -70,7 +70,7 @@ void ProcessHelper::RayStart(std::shared_ptr config, options.store_socket = store_socket; options.raylet_socket = raylet_socket; if (options.worker_type == WorkerType::DRIVER) { - options.job_id = JobID::FromInt(0); + options.job_id = JobID::FromInt(1); } options.gcs_options = gcs_options; options.enable_logging = true; diff --git a/doc/source/ray-client.rst b/doc/source/ray-client.rst index a0cd6292a5d9..a0335faaef1d 100644 --- a/doc/source/ray-client.rst +++ b/doc/source/ray-client.rst @@ -10,13 +10,11 @@ Ray Client Basic usage =========== -The Ray client server is automatically started on port ``10001`` when you use ``ray start --head`` or Ray in an autoscaling cluster. The port can be changed by specifying --ray-client-server-port in the ``ray start`` command. - -To start the server manually, you can run: +While in beta, the server is available as an executable module. To start the server, run ``python -m ray.util.client.server [--host host_ip] [--port port] [--redis-address address] [--redis-password password]`` -This runs ``ray.init()`` with default options and exposes the client gRPC port at ``host_ip:port`` (by default, ``0.0.0.0:10001``). Providing ``redis-address`` and ``redis-password`` will be passed into ``ray.init()`` when the server starts, allowing connection to an existing Ray cluster, as per the `cluster setup `_ instructions. +This runs ``ray.init()`` with default options and exposes the client gRPC port at ``host_ip:port`` (by default, ``0.0.0.0:50051``). 
Providing ``redis-address`` and ``redis-password`` will be passed into ``ray.init()`` when the server starts, allowing connection to an existing Ray cluster, as per the `cluster setup `_ instructions. From here, another Ray script can access that server from a networked machine with ``ray.util.connect()`` @@ -25,7 +23,7 @@ From here, another Ray script can access that server from a networked machine wi import ray import ray.util - ray.util.connect(":10001") # replace with the appropriate host and port + ray.util.connect("0.0.0.0:50051") # replace with the appropriate host and port # Normal Ray code follows @ray.remote @@ -34,12 +32,13 @@ From here, another Ray script can access that server from a networked machine wi do_work.remote(2) #.... - -When the client disconnects, any object or actor references held by the server on behalf of the client are dropped, as if directly disconnecting from the cluster. -============ -Known issues -============ +When the client disconnects, any object or actor references held by the server on behalf of the client are dropped, as if directly disconnecting from the cluster + + +=================== +``RAY_CLIENT_MODE`` +=================== Because Ray client mode affects the behavior of the Ray API, larger scripts or libraries imported before ``ray.util.connect()`` may not realize they're in client mode. This feature is being tracked with `issue #13272 `_ but the workaround here is provided for beta users. @@ -50,3 +49,21 @@ Therefore, an environment variable is also available to force a Ray program into .. code-block:: bash RAY_CLIENT_MODE=1 python my_ray_program.py + + +=================================== +Programatically creating the server +=================================== + +For larger use-cases, it may be desirable to connect remote Ray clients to an existing Ray environment. The server can be started separately via + +.. 
code-block:: python + + from ray.util.client.server import serve + + server = serve("0.0.0.0:50051") + # Server does some work + # ... + # Time to clean up + server.stop(0) + diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index d4ae094d95e3..b61c6939984c 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -285,7 +285,7 @@ def debug(address): "--ray-client-server-port", required=False, type=int, - default=10001, + default=None, help="the port number the ray client server will bind on. If not set, " "the ray client server will not be started.") @click.option( diff --git a/python/ray/tests/test_job.py b/python/ray/tests/test_job.py index cc7909dd8cb9..15b082b460e0 100644 --- a/python/ray/tests/test_job.py +++ b/python/ray/tests/test_job.py @@ -33,7 +33,7 @@ def __init__(self): assert len(actor_table) == 1 job_table = ray.jobs() - assert len(job_table) == 3 # dash, ray client server + assert len(job_table) == 2 # Kill the driver process. p.kill() @@ -79,7 +79,7 @@ def value(self): assert len(actor_table) == 1 job_table = ray.jobs() - assert len(job_table) == 3 # dash, ray client server + assert len(job_table) == 2 # Kill the driver process. p.kill() From 381b9c64cda83ef7ad6706c69df3d47f44aebef5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 121/244] Revert "Unconditionally retry all RPC errors on client connect (#13845)" This reverts commit a25b8dd000128d8e483fcd6e3cb36686bb400de5. --- python/ray/util/client/worker.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index a97ccaca7798..b0a4b78f52b1 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -101,11 +101,17 @@ def __init__(self, # Note that channel_ready_future constitutes its own timeout, # which is why we do not sleep here. 
except grpc.RpcError as e: - logger.info("Ray client server unavailable, " - f"retrying in {timeout}s...") - logger.debug(f"Received when checking init: {e.details()}") - # Ray is not ready yet, wait a timeout. - time.sleep(timeout) + if e.code() == grpc.StatusCode.UNAVAILABLE: + # UNAVAILABLE is gRPC's retryable error, + # so we do that here. + logger.info("Ray client server unavailable, " + f"retrying in {timeout}s...") + logger.debug(f"Received when checking init: {e.details()}") + # Ray is not ready yet, wait a timeout + time.sleep(timeout) + else: + # Any other gRPC error gets a reraise + raise e # Fallthrough, backoff, and retry at the top of the loop logger.info("Waiting for Ray to become ready on the server, " f"retry in {timeout}s...") From e4b8bd850208d987d669ad57780030a5bbbd5219 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 122/244] Revert "remove lru evict docs (#13849)" This reverts commit 9387052278e00d5baa923239f0c276f45dd57028. --- doc/source/memory-management.rst | 50 ++++++++++++++++++++++++++++++-- doc/source/walkthrough.rst | 16 ++++++++-- 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/doc/source/memory-management.rst b/doc/source/memory-management.rst index 8892800a6e94..ca4551750c50 100644 --- a/doc/source/memory-management.rst +++ b/doc/source/memory-management.rst @@ -18,7 +18,7 @@ Ray system memory: this is memory used internally by Ray Application memory: this is memory used by your application - **Worker heap**: memory used by your application (e.g., in Python code or TensorFlow), best measured as the *resident set size (RSS)* of your application minus its *shared memory usage (SHR)* in commands such as ``top``. The reason you need to subtract *SHR* is that object store shared memory is reported by the OS as shared with each worker. Not subtracting *SHR* will result in double counting memory usage. 
- - **Object store memory**: memory used when your application creates objects in the object store via ``ray.put`` and when returning values from remote functions. Objects are reference counted and evicted when they fall out of scope. There is an object store server running on each node. In Ray 1.3+, objects will be `spilled to disk <#object-spilling>`__ if the object store fills up. + - **Object store memory**: memory used when your application creates objects in the object store via ``ray.put`` and when returning values from remote functions. Objects are reference counted and evicted when they fall out of scope. There is an object store server running on each node. - **Object store shared memory**: memory used when your application reads objects via ``ray.get``. Note that if an object is already present on the node, this does not cause additional allocations. This allows large objects to be efficiently shared among many actors and tasks. ObjectRef Reference Counting @@ -26,6 +26,27 @@ ObjectRef Reference Counting Ray implements distributed reference counting so that any ``ObjectRef`` in scope in the cluster is pinned in the object store. This includes local python references, arguments to pending tasks, and IDs serialized inside of other objects. +Frequently Asked Questions (FAQ) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**My application failed with ObjectStoreFullError. What happened?** + +Ensure that you're removing ``ObjectRef`` references when they're no longer needed. See `Debugging using 'ray memory'`_ for information on how to identify what objects are in scope in your application. + +This exception is raised when the object store on a node was full of pinned objects when the application tried to create a new object (either by calling ``ray.put()`` or returning an object from a task). 
If you're sure that the configured object store size was large enough for your application to run, ensure that you're removing ``ObjectRef`` references when they're no longer in use so their objects can be evicted from the object store. + +**I'm running Ray inside IPython or a Jupyter Notebook and there are ObjectRef references causing problems even though I'm not storing them anywhere.** + +Try `Enabling LRU Fallback`_, which will cause unused objects referenced by IPython to be LRU evicted when the object store is full instead of erroring. + +IPython stores the output of every cell in a local Python variable indefinitely. This causes Ray to pin the objects even though your application may not actually be using them. + +**My application used to run on previous versions of Ray but now I'm getting ObjectStoreFullError.** + +Either modify your application to remove ``ObjectRef`` references when they're no longer needed or try `Enabling LRU Fallback`_ to revert to the old behavior. + +In previous versions of Ray, there was no reference counting and instead objects in the object store were LRU evicted once the object store ran out of space. Some applications (e.g., applications that keep references to all objects ever created) may have worked with LRU eviction but do not with reference counting. + Debugging using 'ray memory' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -177,16 +198,38 @@ In this example, we first create an object via ``ray.put()``, then capture its ` In the output of ``ray memory``, we see that the second object displays as a normal ``LOCAL_REFERENCE``, but the first object is listed as ``CAPTURED_IN_OBJECT``. +Enabling LRU Fallback +~~~~~~~~~~~~~~~~~~~~~ + +By default, Ray will raise an exception if the object store is full of pinned objects when an application tries to create a new object. 
However, in some cases applications might keep references to objects much longer than they actually use them, so simply LRU evicting objects from the object store when it's full can prevent the application from failing. + +Please note that relying on this is **not recommended** - instead, if possible you should try to remove references as they're no longer needed in your application to free space in the object store. + +To enable LRU eviction when the object store is full, initialize ray with the ``lru_evict`` option set: + +.. code-block:: python + + ray.init(lru_evict=True) + +.. code-block:: bash + + ray start --lru-evict + Object Spilling --------------- -Ray 1.3+ spills objects to external storage once the object store is full. By default, objects are spilled to the local filesystem. -To configure the directory where objects are placed, use: +Ray 1.2.0+ has *beta* support for spilling objects to external storage once the capacity +of the object store is used up. Please file a `GitHub issue `__ +if you encounter any problems with this new feature. Eventually, object spilling will be +enabled by default, but for now you need to enable it manually: + +To enable object spilling to the local filesystem (single node clusters only): .. code-block:: python ray.init( _system_config={ + "automatic_object_spilling_enabled": True, "object_spilling_config": json.dumps( {"type": "filesystem", "params": {"directory_path": "/tmp/spill"}}, ) @@ -199,6 +242,7 @@ To enable object spilling to remote storage (any URI supported by `smart_open `__. -This feature is available in Ray 1.3+. +When the object store gets full, objects will be evicted to make room for new objects. +This happens in approximate LRU (least recently used) order. To avoid objects from +being evicted, you can call ``get`` and store their values instead. Numpy array +objects cannot be evicted while they are mapped in any Python process. + +.. 
note:: + + Objects created with ``put`` are pinned in memory while a Python/Java reference + to the object ref returned by the put exists. This only applies to the specific + ref returned by put, not refs in general or copies of that refs. + +See also: `object spilling `__. Remote Classes (Actors) ----------------------- From d6a0cb5211a520e4406c0c54dd95e51307503a04 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 123/244] Revert "[Object Spilling] Skip normal ray.get path when spilling objects. (#13831)" This reverts commit dd25fb41c4b34eddccda8b49736711fd26310469. --- python/ray/_raylet.pyx | 12 ------- python/ray/external_storage.py | 10 +++--- python/ray/includes/libcoreworker.pxd | 3 -- src/ray/core_worker/core_worker.cc | 17 ---------- src/ray/core_worker/core_worker.h | 14 -------- .../store_provider/plasma_store_provider.cc | 32 ------------------- .../store_provider/plasma_store_provider.h | 12 ------- 7 files changed, 5 insertions(+), 95 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3d2b9ea737c4..dc9fceaca7df 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -898,18 +898,6 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) - def get_if_local(self, object_refs): - """Get objects from local plasma store directly - without a fetch request to raylet.""" - cdef: - c_vector[shared_ptr[CRayObject]] results - c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) - with nogil: - check_status( - CCoreWorkerProcess.GetCoreWorker().GetIfLocal( - c_object_ids, &results)) - return RayObjectsToDataMetadataPairs(results) - def object_exists(self, ObjectRef object_ref): cdef: c_bool has_object diff --git a/python/ray/external_storage.py b/python/ray/external_storage.py index 26d5c4a4dbd9..f764e9c0fc5e 100644 --- a/python/ray/external_storage.py +++ b/python/ray/external_storage.py @@ -82,11 +82,11 @@ 
class ExternalStorage(metaclass=abc.ABCMeta): def _get_objects_from_store(self, object_refs): worker = ray.worker.global_worker - # Since the object should always exist in the plasma store before - # spilling, it can directly get the object from the local plasma - # store. - # issue: https://github.com/ray-project/ray/pull/13831 - ray_object_pairs = worker.core_worker.get_if_local(object_refs) + ray_object_pairs = worker.core_worker.get_objects( + object_refs, + worker.current_task_id, + timeout_ms=0, + plasma_objects_only=True) return ray_object_pairs def _put_object_to_store(self, metadata, data_size, file_like, object_ref): diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 0b7c3b0f537f..f1acad1fadd8 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -183,9 +183,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results, c_bool plasma_objects_only) - CRayStatus GetIfLocal( - const c_vector[CObjectID] &ids, - c_vector[shared_ptr[CRayObject]] *results) CRayStatus Contains(const CObjectID &object_id, c_bool *has_object) CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects, int64_t timeout_ms, c_vector[c_bool] *results, diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 1961406d8a8a..2f5dcc57efc1 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1058,23 +1058,6 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_m return Status::OK(); } -Status CoreWorker::GetIfLocal(const std::vector &ids, - std::vector> *results) { - results->resize(ids.size(), nullptr); - - absl::flat_hash_map> result_map; - RAY_RETURN_NOT_OK(plasma_store_provider_->GetIfLocal(ids, &result_map)); - for (size_t i = 0; i < ids.size(); i++) { - auto pair = result_map.find(ids[i]); 
- // The caller of this method should guarantee that the object exists in the plasma - // store when this method is called. - RAY_CHECK(pair != result_map.end()); - RAY_CHECK(pair->second != nullptr); - (*results)[i] = pair->second; - } - return Status::OK(); -} - Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { bool found = false; bool in_plasma = false; diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 89331b5ce10f..088ba346a70c 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -555,20 +555,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { std::vector> *results, bool plasma_objects_only = false); - /// Get objects directly from the local plasma store, without waiting for the - /// objects to be fetched from another node. This should only be used - /// internally, never by user code. - /// NOTE: Caller of this method should guarantee that the object already exists in the - /// plasma store, thus it doesn't need to fetch from other nodes. - /// - /// \param[in] ids The IDs of the objects to get. - /// \param[out] results The results will be stored here. A nullptr will be - /// added for objects that were not in the local store. - /// \return Status OK if all objects were found. Returns ObjectNotFound error - /// if at least one object was not in the local store. - Status GetIfLocal(const std::vector &ids, - std::vector> *results); - /// Return whether or not the object store contains the given object. /// /// \param[in] object_id ID of the objects to check for. 
diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index b42c4b50941f..a8f116287228 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -225,38 +225,6 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( return Status::OK(); } -Status CoreWorkerPlasmaStoreProvider::GetIfLocal( - const std::vector &object_ids, - absl::flat_hash_map> *results) { - std::vector plasma_results; - { - std::lock_guard guard(store_client_mutex_); - RAY_RETURN_NOT_OK(store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results)); - } - - for (size_t i = 0; i < object_ids.size(); i++) { - if (plasma_results[i].data != nullptr || plasma_results[i].metadata != nullptr) { - const auto &object_id = object_ids[i]; - std::shared_ptr data = nullptr; - std::shared_ptr metadata = nullptr; - if (plasma_results[i].data && plasma_results[i].data->Size()) { - // We track the set of active data buffers in active_buffers_. On destruction, - // the buffer entry will be removed from the set via callback. 
- data = std::make_shared(plasma_results[i].data, buffer_tracker_, - object_id); - buffer_tracker_->Record(object_id, data.get(), get_current_call_site_()); - } - if (plasma_results[i].metadata && plasma_results[i].metadata->Size()) { - metadata = plasma_results[i].metadata; - } - const auto result_object = - std::make_shared(data, metadata, std::vector()); - (*results)[object_id] = result_object; - } - } - return Status::OK(); -} - Status UnblockIfNeeded(const std::shared_ptr &client, const WorkerContext &ctx) { if (ctx.CurrentTaskIsDirectCall()) { diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index e67c561b6c9c..2282a09a91b1 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -143,18 +143,6 @@ class CoreWorkerPlasmaStoreProvider { absl::flat_hash_map> *results, bool *got_exception); - /// Get objects directly from the local plasma store, without waiting for the - /// objects to be fetched from another node. This should only be used - /// internally, never by user code. - /// - /// \param[in] ids The IDs of the objects to get. - /// \param[out] results The results will be stored here. A nullptr will be - /// added for objects that were not in the local store. - /// \return Status OK if the request to the local object store was - /// successful. - Status GetIfLocal(const std::vector &ids, - absl::flat_hash_map> *results); - Status Contains(const ObjectID &object_id, bool *has_object); Status Wait(const absl::flat_hash_set &object_ids, int num_objects, From 974ea412da8ddda0136242b0df828aaee86834e4 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 124/244] Revert "Fix naming of ray_spilled_objects directory" This reverts commit b5bd888d4bc902bd65f845ae6266da6176690f9c. 
--- python/ray/ray_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index cbfbaaa5bc08..04dfd8f173b7 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -234,4 +234,4 @@ def to_memory_units(memory_bytes, round_up): MAX_INT64_VALUE = 9223372036854775807 # Object Spilling related constants -DEFAULT_OBJECT_PREFIX = "ray_spilled_objects" +DEFAULT_OBJECT_PREFIX = "ray_spilled_object" From 1db568de1c56096f70add49e0a3a9e4d01d22455 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 125/244] Revert "[python/ray]: add cloudpickle dependency (#13838)" This reverts commit 3580ed33f97b602997164b6af8117edefb24ff1d. --- python/requirements/requirements.txt | 1 - python/setup.py | 1 - 2 files changed, 2 deletions(-) diff --git a/python/requirements/requirements.txt b/python/requirements/requirements.txt index 17a3c233f26a..28c387fde7b3 100644 --- a/python/requirements/requirements.txt +++ b/python/requirements/requirements.txt @@ -8,7 +8,6 @@ aiohttp==3.7 aioredis click >= 7.0 -cloudpickle colorama colorful filelock diff --git a/python/setup.py b/python/setup.py index 76e540ada294..e00fcc0820bb 100644 --- a/python/setup.py +++ b/python/setup.py @@ -129,7 +129,6 @@ "aiohttp_cors", "aioredis", "click >= 7.0", - "cloudpickle", "colorama", "colorful", "filelock", From a651b412fb4c30e50fd08dda689e61fd7eef92ee Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 126/244] Revert "[AWS] Fill-in AMI if not provided (#13808)" This reverts commit 62b9616388a8c84772f9b8612a332ddd3e655a42. 
--- python/ray/autoscaler/_private/aws/config.py | 6 ++-- python/ray/tests/aws/test_autoscaler_aws.py | 31 +------------------- 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/python/ray/autoscaler/_private/aws/config.py b/python/ray/autoscaler/_private/aws/config.py index 2fb90787b5eb..4c3a1c448102 100644 --- a/python/ray/autoscaler/_private/aws/config.py +++ b/python/ray/autoscaler/_private/aws/config.py @@ -496,13 +496,11 @@ def _check_ami(config): # If we do not provide a default AMI for the given region, noop. return - head_ami = config["head_node"].get("ImageId", "").lower() - if head_ami in ["", "latest_dlami"]: + if config["head_node"].get("ImageId", "").lower() == "latest_dlami": config["head_node"]["ImageId"] = default_ami _set_config_info(head_ami_src="dlami") - worker_ami = config["worker_nodes"].get("ImageId", "").lower() - if worker_ami in ["", "latest_dlami"]: + if config["worker_nodes"].get("ImageId", "").lower() == "latest_dlami": config["worker_nodes"]["ImageId"] = default_ami _set_config_info(workers_ami_src="dlami") diff --git a/python/ray/tests/aws/test_autoscaler_aws.py b/python/ray/tests/aws/test_autoscaler_aws.py index acf6c2d628c2..52ceb9fb8ecd 100644 --- a/python/ray/tests/aws/test_autoscaler_aws.py +++ b/python/ray/tests/aws/test_autoscaler_aws.py @@ -1,8 +1,6 @@ import pytest -from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \ - bootstrap_aws, \ - DEFAULT_AMI +from ray.autoscaler._private.aws.config import _get_vpc_id_or_die import ray.tests.aws.utils.stubs as stubs import ray.tests.aws.utils.helpers as helpers from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \ @@ -135,33 +133,6 @@ def test_subnet_given_head_and_worker_sg(iam_client_stub, ec2_client_stub): ec2_client_stub.assert_no_pending_responses() -def test_fills_out_amis(iam_client_stub, ec2_client_stub): - # Setup stubs to mock out boto3 - stubs.configure_iam_role_default(iam_client_stub) - 
stubs.configure_key_pair_default(ec2_client_stub) - stubs.describe_a_security_group(ec2_client_stub, DEFAULT_SG) - stubs.configure_subnet_default(ec2_client_stub) - - config = helpers.load_aws_example_config_file("example-full.yaml") - del config["head_node"]["ImageId"] - del config["worker_nodes"]["ImageId"] - - # Pass in SG for stub to work - config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"] - config["worker_nodes"]["SecurityGroupIds"] = ["sg-1234abcd"] - - defaults_filled = bootstrap_aws(config) - - ami = DEFAULT_AMI.get(config.get("provider", {}).get("region")) - - assert defaults_filled["head_node"].get("ImageId") == ami - - assert defaults_filled["worker_nodes"].get("ImageId") == ami - - iam_client_stub.assert_no_pending_responses() - ec2_client_stub.assert_no_pending_responses() - - if __name__ == "__main__": import sys sys.exit(pytest.main(["-v", __file__])) From 3d046383ed91fa91f144e4041df119b0d7757253 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 127/244] Revert "[ray_client]: Add python version check and test (and some minor fixes along the way) (#13722)" This reverts commit b4c187da78e7feed8f1e78993033611315bfdca7. 
--- python/ray/tests/test_client_init.py | 43 +----------------- python/ray/tests/test_client_references.py | 44 +++++++++---------- python/ray/util/client/__init__.py | 24 ++-------- python/ray/util/client/ray_client_helpers.py | 9 +--- python/ray/util/client/server/dataservicer.py | 22 +++++----- python/ray/util/client/server/server.py | 32 +++++--------- 6 files changed, 47 insertions(+), 127 deletions(-) diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py index 0c54f93eafa9..1949fe3fdc8f 100644 --- a/python/ray/tests/test_client_init.py +++ b/python/ray/tests/test_client_init.py @@ -1,11 +1,7 @@ """Client tests that run their own init (as with init_and_serve) live here""" -import pytest - import time -import sys import ray.util.client.server.server as ray_client_server -import ray.core.generated.ray_client_pb2 as ray_client_pb2 from ray.util.client import RayAPIStub @@ -13,8 +9,7 @@ def test_num_clients(): # Tests num clients reporting; useful if you want to build an app that # load balances clients between Ray client servers. 
- server_handle, _ = ray_client_server.init_and_serve("localhost:50051") - server = server_handle.grpc_server + server, _ = ray_client_server.init_and_serve("localhost:50051") try: api1 = RayAPIStub() info1 = api1.connect("localhost:50051") @@ -40,39 +35,3 @@ def test_num_clients(): finally: ray_client_server.shutdown_with_server(server) time.sleep(2) - - -def test_python_version(): - - server_handle, _ = ray_client_server.init_and_serve("localhost:50051") - try: - ray = RayAPIStub() - info1 = ray.connect("localhost:50051") - assert info1["python_version"] == ".".join( - [str(x) for x in list(sys.version_info)[:3]]) - ray.disconnect() - time.sleep(1) - - def mock_connection_response(): - return ray_client_pb2.ConnectionInfoResponse( - num_clients=1, - python_version="2.7.12", - ray_version="", - ray_commit="", - ) - - # inject mock connection function - server_handle.data_servicer._build_connection_response = \ - mock_connection_response - - ray = RayAPIStub() - with pytest.raises(RuntimeError): - _ = ray.connect("localhost:50051") - - ray = RayAPIStub() - info3 = ray.connect("localhost:50051", ignore_version=True) - assert info3["num_clients"] == 1, info3 - ray.disconnect() - finally: - ray_client_server.shutdown_with_server(server_handle.grpc_server) - time.sleep(2) diff --git a/python/ray/tests/test_client_references.py b/python/ray/tests/test_client_references.py index 8a4458e14af8..834fadfcf874 100644 --- a/python/ray/tests/test_client_references.py +++ b/python/ray/tests/test_client_references.py @@ -1,38 +1,39 @@ from ray.util.client.ray_client_helpers import ray_start_client_server -from ray.util.client.ray_client_helpers import ray_start_client_server_pair from ray.test_utils import wait_for_condition import ray as real_ray from ray.core.generated.gcs_pb2 import ActorTableData +from ray.util.client.server.server import _get_current_servicer -def server_object_ref_count(server, n): +def server_object_ref_count(n): + server = _get_current_servicer() assert 
server is not None def test_cond(): - if len(server.task_servicer.object_refs) == 0: + if len(server.object_refs) == 0: # No open clients return n == 0 - client_id = list(server.task_servicer.object_refs.keys())[0] - return len(server.task_servicer.object_refs[client_id]) == n + client_id = list(server.object_refs.keys())[0] + return len(server.object_refs[client_id]) == n return test_cond -def server_actor_ref_count(server, n): +def server_actor_ref_count(n): + server = _get_current_servicer() assert server is not None def test_cond(): - if len(server.task_servicer.actor_refs) == 0: + if len(server.actor_refs) == 0: # No running actors return n == 0 - return len(server.task_servicer.actor_refs) == n + return len(server.actor_refs) == n return test_cond def test_delete_refs_on_disconnect(ray_start_regular): - with ray_start_client_server_pair() as pair: - ray, server = pair + with ray_start_client_server() as ray: @ray.remote def f(x): @@ -45,14 +46,14 @@ def f(x): # in a different category, according to the raylet. 
assert len(real_ray.objects()) == 2 # But we're maintaining the reference - assert server_object_ref_count(server, 3)() + assert server_object_ref_count(3)() # And can get the data assert ray.get(thing1) == 8 # Close the client ray.close() - wait_for_condition(server_object_ref_count(server, 0), timeout=5) + wait_for_condition(server_object_ref_count(0), timeout=5) def test_cond(): return len(real_ray.objects()) == 0 @@ -61,8 +62,7 @@ def test_cond(): def test_delete_ref_on_object_deletion(ray_start_regular): - with ray_start_client_server_pair() as pair: - ray, server = pair + with ray_start_client_server() as ray: vals = { "ref": ray.put("Hello World"), "ref2": ray.put("This value stays"), @@ -70,12 +70,11 @@ def test_delete_ref_on_object_deletion(ray_start_regular): del vals["ref"] - wait_for_condition(server_object_ref_count(server, 1), timeout=5) + wait_for_condition(server_object_ref_count(1), timeout=5) def test_delete_actor_on_disconnect(ray_start_regular): - with ray_start_client_server_pair() as pair: - ray, server = pair + with ray_start_client_server() as ray: @ray.remote class Accumulator: @@ -91,13 +90,13 @@ def get(self): actor = Accumulator.remote() actor.inc.remote() - assert server_actor_ref_count(server, 1)() + assert server_actor_ref_count(1)() assert ray.get(actor.get.remote()) == 1 ray.close() - wait_for_condition(server_actor_ref_count(server, 0), timeout=5) + wait_for_condition(server_actor_ref_count(0), timeout=5) def test_cond(): alive_actors = [ @@ -110,8 +109,7 @@ def test_cond(): def test_delete_actor(ray_start_regular): - with ray_start_client_server_pair() as pair: - ray, server = pair + with ray_start_client_server() as ray: @ray.remote class Accumulator: @@ -126,11 +124,11 @@ def inc(self): actor2 = Accumulator.remote() actor2.inc.remote() - assert server_actor_ref_count(server, 2)() + assert server_actor_ref_count(2)() del actor - wait_for_condition(server_actor_ref_count(server, 1), timeout=5) + 
wait_for_condition(server_actor_ref_count(1), timeout=5) def test_simple_multiple_references(ray_start_regular): diff --git a/python/ray/util/client/__init__.py b/python/ray/util/client/__init__.py index 9a2d14877936..1c28dc53c64a 100644 --- a/python/ray/util/client/__init__.py +++ b/python/ray/util/client/__init__.py @@ -1,6 +1,5 @@ from typing import List, Tuple, Dict, Any -import sys import logging logger = logging.getLogger(__name__) @@ -26,9 +25,7 @@ def connect(self, conn_str: str, secure: bool = False, metadata: List[Tuple[str, str]] = None, - connection_retries: int = 3, - *, - ignore_version: bool = False) -> Dict[str, Any]: + connection_retries: int = 3) -> Dict[str, Any]: """Connect the Ray Client to a server. Args: @@ -59,25 +56,11 @@ def connect(self, metadata=metadata, connection_retries=connection_retries) self.api.worker = self.client_worker - conn_info = self.client_worker.connection_info() - self._check_versions(conn_info, ignore_version) - return conn_info + return self.client_worker.connection_info() except Exception: self.disconnect() raise - def _check_versions(self, conn_info, ignore_version: bool) -> None: - local_major_minor = f"{sys.version_info[0]}.{sys.version_info[1]}" - if not conn_info["python_version"].startswith(local_major_minor): - version_str = f"{local_major_minor}.{sys.version_info[2]}" - msg = "Python minor versions differ between client and server:" + \ - f" client is {version_str}," + \ - f" server is {conn_info['python_version']}" - if ignore_version: - logger.warning(msg) - else: - raise RuntimeError(msg) - def disconnect(self): """Disconnect the Ray Client. 
""" @@ -114,9 +97,8 @@ def init(self, *args, **kwargs): if self._server is not None: raise Exception("Trying to start two instances of ray via client") import ray.util.client.server.server as ray_client_server - server_handle, address_info = ray_client_server.init_and_serve( + self._server, address_info = ray_client_server.init_and_serve( "localhost:50051", *args, **kwargs) - self._server = server_handle.grpc_server self.connect("localhost:50051") self._connected_with_init = True return address_info diff --git a/python/ray/util/client/ray_client_helpers.py b/python/ray/util/client/ray_client_helpers.py index 77f09346d7af..be5a2918c3b2 100644 --- a/python/ray/util/client/ray_client_helpers.py +++ b/python/ray/util/client/ray_client_helpers.py @@ -6,18 +6,11 @@ @contextmanager def ray_start_client_server(): - with ray_start_client_server_pair() as pair: - client, server = pair - yield client - - -@contextmanager -def ray_start_client_server_pair(): ray._inside_client_test = True server = ray_client_server.serve("localhost:50051") ray.connect("localhost:50051") try: - yield ray, server + yield ray finally: ray._inside_client_test = False ray.disconnect() diff --git a/python/ray/util/client/server/dataservicer.py b/python/ray/util/client/server/dataservicer.py index a01369e43662..7a7fb3eae73f 100644 --- a/python/ray/util/client/server/dataservicer.py +++ b/python/ray/util/client/server/dataservicer.py @@ -50,8 +50,16 @@ def Datapath(self, request_iterator, context): resp = ray_client_pb2.DataResponse( release=ray_client_pb2.ReleaseResponse(ok=released)) elif req_type == "connection_info": - resp = ray_client_pb2.DataResponse( - connection_info=self._build_connection_response()) + with self._clients_lock: + cur_num_clients = self._num_clients + info = ray_client_pb2.ConnectionInfoResponse( + num_clients=cur_num_clients, + python_version="{}.{}.{}".format( + sys.version_info[0], sys.version_info[1], + sys.version_info[2]), + ray_version=ray.__version__, + 
ray_commit=ray.__commit__) + resp = ray_client_pb2.DataResponse(connection_info=info) else: raise Exception(f"Unreachable code: Request type " f"{req_type} not handled in Datapath") @@ -64,13 +72,3 @@ def Datapath(self, request_iterator, context): self.basic_service.release_all(client_id) with self._clients_lock: self._num_clients -= 1 - - def _build_connection_response(self): - with self._clients_lock: - cur_num_clients = self._num_clients - return ray_client_pb2.ConnectionInfoResponse( - num_clients=cur_num_clients, - python_version="{}.{}.{}".format( - sys.version_info[0], sys.version_info[1], sys.version_info[2]), - ray_version=ray.__version__, - ray_commit=ray.__commit__) diff --git a/python/ray/util/client/server/server.py b/python/ray/util/client/server/server.py index 6a7badaf703a..19a192337105 100644 --- a/python/ray/util/client/server/server.py +++ b/python/ray/util/client/server/server.py @@ -3,7 +3,6 @@ import grpc import base64 from collections import defaultdict -from dataclasses import dataclass from typing import Any from typing import Dict @@ -408,18 +407,13 @@ def decode_options( return opts -@dataclass -class ClientServerHandle: - """Holds the handles to the registered gRPC servicers and their server.""" - task_servicer: RayletServicer - data_servicer: DataServicer - logs_servicer: LogstreamServicer - grpc_server: grpc.Server +_current_servicer: Optional[RayletServicer] = None - # Add a hook for all the cases that previously - # expected simply a gRPC server - def __getattr__(self, attr): - return getattr(self.grpc_server, attr) + +# Used by tests to peek inside the servicer +def _get_current_servicer(): + global _current_servicer + return _current_servicer def serve(connection_str): @@ -427,6 +421,8 @@ def serve(connection_str): task_servicer = RayletServicer() data_servicer = DataServicer(task_servicer) logs_servicer = LogstreamServicer() + global _current_servicer + _current_servicer = task_servicer 
ray_client_pb2_grpc.add_RayletDriverServicer_to_server( task_servicer, server) ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server( @@ -434,22 +430,16 @@ def serve(connection_str): ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server( logs_servicer, server) server.add_insecure_port(connection_str) - current_handle = ClientServerHandle( - task_servicer=task_servicer, - data_servicer=data_servicer, - logs_servicer=logs_servicer, - grpc_server=server, - ) server.start() - return current_handle + return server def init_and_serve(connection_str, *args, **kwargs): with disable_client_hook(): # Disable client mode inside the worker's environment info = ray.init(*args, **kwargs) - server_handle = serve(connection_str) - return (server_handle, info) + server = serve(connection_str) + return (server, info) def shutdown_with_server(server, _exiting_interpreter=False): From f90734684967030f21e3c3ba184e6e2e5271a1c6 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 128/244] Revert "[core][object spillin] Fix bugs in admission control (#13781)" This reverts commit f3ce811b696879830a7959de8cccc70def03811c. 
--- src/ray/gcs/accessor.h | 2 +- src/ray/gcs/gcs_client/service_based_accessor.cc | 3 +-- src/ray/gcs/gcs_client/service_based_accessor.h | 2 +- src/ray/object_manager/plasma/store.cc | 2 -- src/ray/object_manager/plasma/store.h | 7 ++----- src/ray/object_manager/pull_manager.cc | 10 ++-------- src/ray/raylet/local_object_manager.cc | 6 +----- src/ray/raylet/test/local_object_manager_test.cc | 2 +- 8 files changed, 9 insertions(+), 25 deletions(-) diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index e7ddb765b9d3..3bc7002021b3 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -308,7 +308,7 @@ class ObjectInfoAccessor { /// \return Status virtual Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size, + const NodeID &spilled_node_id, const StatusCallback &callback) = 0; /// Remove location of object from GCS asynchronously. diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index c4f550e5075b..891bd6ba6a54 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -1102,7 +1102,7 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_i Status ServiceBasedObjectInfoAccessor::AsyncAddSpilledUrl( const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size, const StatusCallback &callback) { + const NodeID &spilled_node_id, const StatusCallback &callback) { RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id << ", spilled_url = " << spilled_url << ", job id = " << object_id.TaskId().JobId(); @@ -1110,7 +1110,6 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddSpilledUrl( request.set_object_id(object_id.Binary()); request.set_spilled_url(spilled_url); request.set_spilled_node_id(spilled_node_id.Binary()); - request.set_size(object_size); 
auto operation = [this, request, callback](const SequencerDoneCallback &done_callback) { client_impl_->GetGcsRpcClient().AddObjectLocation( diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index 79deb2a6c3b2..149fa6d2e8d4 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -326,7 +326,7 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor { size_t object_size, const StatusCallback &callback) override; Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &node_id, size_t object_size, + const NodeID &node_id, const StatusCallback &callback) override; Status AsyncRemoveLocation(const ObjectID &object_id, const NodeID &node_id, diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index e101c5a9b71a..9bae68b3a3a8 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -159,7 +159,6 @@ void PlasmaStore::AddToClientObjectIds(const ObjectID &object_id, ObjectTableEnt if (entry->ref_count == 0) { // Tell the eviction policy that this object is being used. eviction_policy_.BeginObjectAccess(object_id); - num_bytes_in_use_ += entry->data_size + entry->metadata_size; } // Increase reference count. entry->ref_count++; @@ -538,7 +537,6 @@ int PlasmaStore::RemoveFromClientObjectIds(const ObjectID &object_id, // If no more clients are using this object, notify the eviction policy // that the object is no longer being used. if (entry->ref_count == 0) { - num_bytes_in_use_ -= entry->data_size + entry->metadata_size; RAY_LOG(DEBUG) << "Releasing object no longer in use " << object_id; if (deletion_cache_.count(object_id) == 0) { // Tell the eviction policy that this object is no longer being used. 
diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index 214cf9763bf6..2ad3aad261c7 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -211,9 +211,8 @@ class PlasmaStore { void ProcessCreateRequests(); void GetAvailableMemory(std::function callback) const { - int64_t num_bytes_in_use = static_cast(num_bytes_in_use_); - RAY_CHECK(PlasmaAllocator::GetFootprintLimit() >= num_bytes_in_use); - size_t available = PlasmaAllocator::GetFootprintLimit() - num_bytes_in_use; + size_t available = + PlasmaAllocator::GetFootprintLimit() - eviction_policy_.GetPinnedMemoryBytes(); callback(available); } @@ -314,8 +313,6 @@ class PlasmaStore { /// interface that node manager or object manager can access the plasma store with this /// mutex if it is not absolutely necessary. std::recursive_mutex mutex_; - - size_t num_bytes_in_use_ = 0; }; } // namespace plasma diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index 1ce460b81004..9be63c7e1d64 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -277,17 +277,11 @@ void PullManager::OnLocationChange(const ObjectID &object_id, it->second.spilled_url = spilled_url; it->second.spilled_node_id = spilled_node_id; if (!it->second.object_size_set) { + RAY_LOG(DEBUG) << "Updated size of object " << object_id << " to " << object_size + << ", num bytes being pulled is now " << num_bytes_being_pulled_; it->second.object_size = object_size; it->second.object_size_set = true; UpdatePullsBasedOnAvailableMemory(num_bytes_available_); - RAY_LOG(DEBUG) << "Updated size of object " << object_id << " to " << object_size - << ", num bytes being pulled is now " << num_bytes_being_pulled_; - if (it->second.object_size == 0) { - RAY_LOG(WARNING) << "Size of object " << object_id - << " stored in object store is zero. 
This may be a bug since " - "objects in the object store should be large, and can result " - "in too many objects being fetched to this node"; - } } RAY_LOG(DEBUG) << "OnLocationChange " << spilled_url << " num clients " << client_ids.size(); diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index ef9e53e21baf..9ebaf75a8088 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -270,15 +270,11 @@ void LocalObjectManager::AddSpilledUrls( // don't need to report where this object is spilled. const auto node_id_object_spilled = is_external_storage_type_fs_ ? self_node_id_ : NodeID::Nil(); - - auto it = objects_pending_spill_.find(object_id); - RAY_CHECK(it != objects_pending_spill_.end()); - // Write to object directory. Wait for the write to finish before // releasing the object to make sure that the spilled object can // be retrieved by other raylets. RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl( - object_id, object_url, node_id_object_spilled, it->second->GetSize(), + object_id, object_url, node_id_object_spilled, [this, object_id, object_url, callback, num_remaining](Status status) { RAY_CHECK_OK(status); // Unpin the object. 
diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index f68707ce7a01..8ff77250f78f 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -194,7 +194,7 @@ class MockObjectInfoAccessor : public gcs::ObjectInfoAccessor { size_t object_size, const gcs::StatusCallback &callback)); Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size, + const NodeID &spilled_node_id, const gcs::StatusCallback &callback) { object_urls[object_id] = spilled_url; callbacks.push_back(callback); From bd87d14d9900209737193bfb6aec9517a5716ec6 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 129/244] Revert "bug fix for doc (#13834)" This reverts commit fc09a4008edeb20c8cb35ddc3d644110d4d9191f. --- doc/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index 76bfa3f60a12..be01da3cf2a8 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -119,7 +119,7 @@ Ray provides Python, Java, and *EXPERIMENTAL* C++ API. And Ray uses Tasks (funct | - Build ray from source with *bazel* as shown `here `__. | - Modify `cpp/example/example.cc`. | - Run `"bazel build //cpp:example"`. - | Option 1: run the example directly with a dynamic library path. It will start a Ray cluster automatically. + | Option 1:, run the example directly with a dynamic library path. It will start a Ray cluster automatically. | - Run `"ray stop"`. | - Run `"./bazel-bin/cpp/example/example --dynamic-library-path=bazel-bin/cpp/example/example.so"` | Option 2: connect to an existing Ray cluster with a known redis address (e.g. `127.0.0.1:6379`). 
From 226d5c5352b8782e6151cb5c503427f3d1d77cb6 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 130/244] Revert "support dynamic library loading in C++ worker (#13734)" This reverts commit de11032e0f0ec491c94db0a8ea7006c8e926de0d. --- ci/travis/bazel-format.sh | 2 +- ci/travis/ci.sh | 3 - cpp/BUILD.bazel | 58 +++++--------- cpp/dev_BUILD.bazel | 74 ++++++++++++++++++ cpp/example/BUILD.bazel | 37 --------- cpp/include/ray/api/ray_config.h | 7 -- cpp/include/ray/experimental/default_worker.h | 9 +++ cpp/src/example/example.cc | 76 +++++++++++++++++++ cpp/{ => src/ray}/example/example.cc | 46 +++++------ cpp/src/ray/runtime/task/task_executor.cc | 2 +- cpp/src/ray/test/cluster/cluster_mode_test.cc | 23 +++--- cpp/src/ray/util/function_helper.cc | 11 ++- cpp/src/ray/worker/default_worker.cc | 18 +++-- doc/source/index.rst | 15 ++-- python/ray/_private/services.py | 6 +- 15 files changed, 243 insertions(+), 144 deletions(-) create mode 100644 cpp/dev_BUILD.bazel delete mode 100644 cpp/example/BUILD.bazel create mode 100644 cpp/include/ray/experimental/default_worker.h create mode 100644 cpp/src/example/example.cc rename cpp/{ => src/ray}/example/example.cc (81%) diff --git a/ci/travis/bazel-format.sh b/ci/travis/bazel-format.sh index a97b97e6f777..3910529a4997 100755 --- a/ci/travis/bazel-format.sh +++ b/ci/travis/bazel-format.sh @@ -45,6 +45,6 @@ done pushd "$ROOT_DIR"/../.. 
BAZEL_FILES=(bazel/BUILD bazel/ray.bzl BUILD.bazel java/BUILD.bazel \ - cpp/BUILD.bazel cpp/example/BUILD.bazel streaming/BUILD.bazel streaming/java/BUILD.bazel WORKSPACE) + cpp/BUILD.bazel streaming/BUILD.bazel streaming/java/BUILD.bazel WORKSPACE) buildifier -mode=$RUN_TYPE -diff_command="diff -u" "${BAZEL_FILES[@]}" popd diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 6267a232125a..e72380bdb8c6 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -188,9 +188,6 @@ test_cpp() { bazel build --config=ci //cpp:all # shellcheck disable=SC2046 bazel test --config=ci $(./scripts/bazel_export_options) //cpp:all --build_tests_only - # run the cpp example - bazel run //cpp/example:example - } test_wheels() { diff --git a/cpp/BUILD.bazel b/cpp/BUILD.bazel index a4dc5b505dcb..af82486a0d2d 100644 --- a/cpp/BUILD.bazel +++ b/cpp/BUILD.bazel @@ -21,6 +21,7 @@ cc_library( "src/ray/util/*.h", "src/ray/*.cc", "src/ray/*.h", + "src/ray/worker/default_worker.cc", ]), hdrs = glob([ "include/ray/*.h", @@ -44,36 +45,18 @@ cc_library( ) cc_binary( - name = "default_worker", + name = "example", + testonly = 1, srcs = glob([ - "src/ray/worker/default_worker.cc", + "src/example/example.cc", ]), copts = COPTS, - linkstatic = True, + linkstatic = False, deps = [ "ray_api", ], ) -genrule( - name = "ray_cpp_pkg", - srcs = [ - "default_worker", - "ray_api", - ], - outs = ["ray_cpp_pkg.out"], - cmd = """ - WORK_DIR="$$(pwd)" && - mkdir -p "$$WORK_DIR/python/ray/core/src/ray/cpp/" && - cp -f $(location default_worker) "$$WORK_DIR/python/ray/core/src/ray/cpp/default_worker" && - cp -f $(locations ray_api) "$$WORK_DIR/python/ray/core/src/ray/cpp/" && - echo "$$WORK_DIR" > $@ - """, - local = 1, - visibility = ["//visibility:public"], -) - -# test cc_test( name = "api_test", srcs = glob([ @@ -93,32 +76,27 @@ cc_test( srcs = glob([ "src/ray/test/cluster/*.cc", ]), - args = [ - "$(location cluster_mode_test.so)", - ], copts = COPTS, - data = [ - "cluster_mode_test.so", - "ray_cpp_pkg", - ], 
linkstatic = True, deps = [ "ray_api", - "@com_github_gflags_gflags//:gflags", "@com_google_googletest//:gtest_main", ], ) -cc_binary( - name = "cluster_mode_test.so", - srcs = glob([ - "src/ray/test/cluster/*.cc", - ]), - copts = COPTS, - linkstatic = True, - deps = [ +genrule( + name = "ray_cpp_pkg", + srcs = [ + "cluster_mode_test", "ray_api", - "@com_github_gflags_gflags//:gflags", - "@com_google_googletest//:gtest_main", ], + outs = ["ray_cpp_pkg.out"], + cmd = """ + WORK_DIR="$$(pwd)" && + mkdir -p "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + cp -f $(location cluster_mode_test) "$$WORK_DIR/python/ray/core/src/ray/cpp/default_worker" && + cp -f $(locations ray_api) "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + echo "$$WORK_DIR" > $@ + """, + local = 1, ) diff --git a/cpp/dev_BUILD.bazel b/cpp/dev_BUILD.bazel new file mode 100644 index 000000000000..8c7470b99cbe --- /dev/null +++ b/cpp/dev_BUILD.bazel @@ -0,0 +1,74 @@ +# Bazel development build for C++ API. +# C/C++ documentation: https://docs.bazel.build/versions/master/be/c-cpp.html + +load("//bazel:ray.bzl", "COPTS") + +cc_library( + name = "ray_api", + srcs = glob([ + "src/ray/api.cc", + "src/ray/api/*.cc", + "src/ray/api/*.h", + "src/ray/app/*.cc", + "src/ray/app/*.h", + "src/ray/runtime/*.cc", + "src/ray/runtime/*.h", + "src/ray/runtime/**/*.cc", + "src/ray/runtime/**/*.h", + "src/ray/runtime/task/*.cc", + "src/ray/runtime/task/*.h", + "src/ray/util/*.cc", + "src/ray/util/*.h", + "src/ray/*.cc", + "src/ray/*.h", + "src/ray/worker/default_worker.cc", + ]), + hdrs = glob([ + "include/ray/*.h", + "include/ray/**/*.h", + "include/ray/**/**/*.h", + ]), + copts = COPTS, + linkopts = ["-ldl"], + linkstatic = True, + strip_include_prefix = "include", + visibility = ["//visibility:public"], + deps = [ + "//:core_worker_lib", + "//:ray_common", + "//:ray_util", + "@boost//:asio", + "@boost//:thread", + "@com_google_absl//absl/synchronization", + "@msgpack", + ], +) + +cc_binary( + name = "example", + srcs = 
glob([ + "src/ray/example/*.cc", + ]), + copts = COPTS, + linkstatic = True, + deps = [ + "ray_api", + ], +) + +genrule( + name = "ray_cpp_pkg", + srcs = [ + "example", + "ray_api", + ], + outs = ["ray_cpp_pkg.out"], + cmd = """ + WORK_DIR="$$(pwd)" && + mkdir -p "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + cp -f $(location example) "$$WORK_DIR/python/ray/core/src/ray/cpp/default_worker" && + cp -f $(locations ray_api) "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + echo "$$WORK_DIR" > $@ + """, + local = 1, +) diff --git a/cpp/example/BUILD.bazel b/cpp/example/BUILD.bazel deleted file mode 100644 index a14212042812..000000000000 --- a/cpp/example/BUILD.bazel +++ /dev/null @@ -1,37 +0,0 @@ -# Bazel development build for C++ API. -# C/C++ documentation: https://docs.bazel.build/versions/master/be/c-cpp.html - -load("//bazel:ray.bzl", "COPTS") - -cc_binary( - name = "example", - srcs = glob([ - "*.cc", - ]), - args = [ - "--dynamic-library-path $(location example.so)", - ], - copts = COPTS, - data = [ - "example.so", - "//cpp:ray_cpp_pkg", - ], - linkstatic = True, - deps = [ - "//cpp:ray_api", - "@com_github_gflags_gflags//:gflags", - ], -) - -cc_binary( - name = "example.so", - srcs = glob([ - "*.cc", - ]), - copts = COPTS, - linkstatic = True, - deps = [ - "//cpp:ray_api", - "@com_github_gflags_gflags//:gflags", - ], -) diff --git a/cpp/include/ray/api/ray_config.h b/cpp/include/ray/api/ray_config.h index b8c4f0cd285e..b6bc55d5dcfe 100644 --- a/cpp/include/ray/api/ray_config.h +++ b/cpp/include/ray/api/ray_config.h @@ -34,13 +34,6 @@ class RayConfig { static std::shared_ptr GetInstance(); - void SetRedisAddress(const std::string address) { - auto pos = address.find(':'); - RAY_CHECK(pos != std::string::npos); - redis_ip = address.substr(0, pos); - redis_port = std::stoi(address.substr(pos + 1, address.length())); - } - private: static std::shared_ptr config_; }; diff --git a/cpp/include/ray/experimental/default_worker.h 
b/cpp/include/ray/experimental/default_worker.h new file mode 100644 index 000000000000..2c0e02259d6e --- /dev/null +++ b/cpp/include/ray/experimental/default_worker.h @@ -0,0 +1,9 @@ +#pragma once + +namespace ray { +namespace api { + +int default_worker_main(int argc, char **argv); + +} // namespace api +} // namespace ray diff --git a/cpp/src/example/example.cc b/cpp/src/example/example.cc new file mode 100644 index 000000000000..1375136caac0 --- /dev/null +++ b/cpp/src/example/example.cc @@ -0,0 +1,76 @@ + +/// This is a complete example of writing a distributed program using the C ++ worker API. + +/// including the header +#include + +/// using namespace +using namespace ::ray::api; + +/// general function of user code +int Return1() { return 1; } +int Plus1(int x) { return x + 1; } +int Plus(int x, int y) { return x + y; } + +/// a class of user code +class Counter { + public: + int count; + + Counter() { count = 0; } + + static Counter *FactoryCreate() { return new Counter(); } + /// non static function + int Add(int x) { + count += x; + return count; + } +}; + +int main() { + /// initialization + Ray::Init(); + + /// put and get object + auto obj = Ray::Put(123); + auto get_result = obj.Get(); + + /// general function remote call(args passed by value) + auto r0 = Ray::Task(Return1).Remote(); + auto r1 = Ray::Task(Plus1, 1).Remote(); + auto r2 = Ray::Task(Plus, 1, 2).Remote(); + + int result0 = *(r0.Get()); + int result1 = *(r1.Get()); + int result2 = *(r2.Get()); + + std::cout << "Ray::call with value results: " << result0 << " " << result1 << " " + << result2 << std::endl; + + /// general function remote call(args passed by reference) + auto r3 = Ray::Task(Return1).Remote(); + auto r4 = Ray::Task(Plus1, r3).Remote(); + auto r5 = Ray::Task(Plus, r4, 1).Remote(); + + int result3 = *(r3.Get()); + int result4 = *(r4.Get()); + int result5 = *(r5.Get()); + + std::cout << "Ray::call with reference results: " << result3 << " " << result4 << " " + << result5 << 
std::endl; + + /// create actor and actor function remote call + ActorHandle actor = Ray::Actor(Counter::FactoryCreate).Remote(); + auto r6 = actor.Task(&Counter::Add, 5).Remote(); + auto r7 = actor.Task(&Counter::Add, 1).Remote(); + auto r8 = actor.Task(&Counter::Add, 1).Remote(); + auto r9 = actor.Task(&Counter::Add, r8).Remote(); + + int result6 = *(r6.Get()); + int result7 = *(r7.Get()); + int result8 = *(r8.Get()); + int result9 = *(r9.Get()); + + std::cout << "Ray::call with actor results: " << result6 << " " << result7 << " " + << result8 << " " << result9 << std::endl; +} diff --git a/cpp/example/example.cc b/cpp/src/ray/example/example.cc similarity index 81% rename from cpp/example/example.cc rename to cpp/src/ray/example/example.cc index 13f82192d0ab..7ada6f1f5f22 100644 --- a/cpp/example/example.cc +++ b/cpp/src/ray/example/example.cc @@ -1,12 +1,8 @@ -/// This is a complete example of writing a distributed program using the C ++ worker API. - -/// including the header #include #include -#include "gflags/gflags.h" +#include -/// using namespace using namespace ::ray::api; /// general function of user code @@ -36,25 +32,22 @@ class Counter { } }; -DEFINE_string(redis_address, "", "The ip address of redis server."); - -DEFINE_string(dynamic_library_path, "", "The local path of the dynamic library."); - int main(int argc, char **argv) { - /// configuration - gflags::ParseCommandLineFlags(&argc, &argv, true); - const std::string dynamic_library_path = FLAGS_dynamic_library_path; - const std::string redis_address = FLAGS_redis_address; - gflags::ShutDownCommandLineFlags(); - RAY_CHECK(!dynamic_library_path.empty()) - << "Please add a local dynamic library by '--dynamic-library-path'"; - ray::api::RayConfig::GetInstance()->lib_name = dynamic_library_path; - if (!redis_address.empty()) { - ray::api::RayConfig::GetInstance()->SetRedisAddress(redis_address); + /// Currently, we compile `default_worker` and `example` in one single binary, + /// to work around a 
symbol conflicting issue. + /// This is the main function of the binary, and we use the `is_default_worker` arg to + /// tell if this binary is used as `default_worker` or `example`. + const char *default_worker_magic = "is_default_worker"; + /// `is_default_worker` is the last arg of `argv` + if (argc > 1 && + memcmp(argv[argc - 1], default_worker_magic, strlen(default_worker_magic)) == 0) { + default_worker_main(argc, argv); + return 0; } - ::ray::api::RayConfig::GetInstance()->run_mode = RunMode::CLUSTER; - - /// initialization + /// initialization to cluster mode + ray::api::RayConfig::GetInstance()->run_mode = RunMode::CLUSTER; + /// Dynamic library loading is not supported yet. + ray::api::RayConfig::GetInstance()->lib_name = ""; Ray::Init(); /// put and get object @@ -93,6 +86,7 @@ int main(int argc, char **argv) { /// general function remote call(args passed by value) auto r0 = Ray::Task(Return1).Remote(); auto r2 = Ray::Task(Plus, 3, 22).Remote(); + int task_result3 = *(Ray::Get(r2)); std::cout << "task_result3 = " << task_result3 << std::endl; @@ -101,6 +95,7 @@ int main(int argc, char **argv) { auto r4 = Ray::Task(Plus1, r3).Remote(); auto r5 = Ray::Task(Plus, r4, r3).Remote(); auto r6 = Ray::Task(Plus, r4, 10).Remote(); + int task_result4 = *(Ray::Get(r6)); int task_result5 = *(Ray::Get(r5)); std::cout << "task_result4 = " << task_result4 << ", task_result5 = " << task_result5 @@ -109,30 +104,31 @@ int main(int argc, char **argv) { /// create actor and actor function remote call with args passed by value ActorHandle actor4 = Ray::Actor(Counter::FactoryCreate, 10).Remote(); auto r10 = actor4.Task(&Counter::Add, 8).Remote(); + int actor_result4 = *(Ray::Get(r10)); std::cout << "actor_result4 = " << actor_result4 << std::endl; /// create actor and task function remote call with args passed by reference ActorHandle actor5 = Ray::Actor(Counter::FactoryCreate, r10, 0).Remote(); + auto r11 = actor5.Task(&Counter::Add, r0).Remote(); auto r12 = 
actor5.Task(&Counter::Add, r11).Remote(); auto r13 = actor5.Task(&Counter::Add, r10).Remote(); auto r14 = actor5.Task(&Counter::Add, r13).Remote(); auto r15 = Ray::Task(Plus, r0, r11).Remote(); auto r16 = Ray::Task(Plus1, r15).Remote(); + int result12 = *(Ray::Get(r12)); int result14 = *(Ray::Get(r14)); int result11 = *(Ray::Get(r11)); int result13 = *(Ray::Get(r13)); int result16 = *(Ray::Get(r16)); int result15 = *(Ray::Get(r15)); + std::cout << "Final result:" << std::endl; std::cout << "result11 = " << result11 << ", result12 = " << result12 << ", result13 = " << result13 << ", result14 = " << result14 << ", result15 = " << result15 << ", result16 = " << result16 << std::endl; - - /// shutdown Ray::Shutdown(); - return 0; } diff --git a/cpp/src/ray/runtime/task/task_executor.cc b/cpp/src/ray/runtime/task/task_executor.cc index d0879112fcf3..f2b06af09370 100644 --- a/cpp/src/ray/runtime/task/task_executor.cc +++ b/cpp/src/ray/runtime/task/task_executor.cc @@ -29,7 +29,7 @@ Status TaskExecutor::ExecuteTask( const std::vector &arg_reference_ids, const std::vector &return_ids, const std::string &debugger_breakpoint, std::vector> *results) { - RAY_LOG(INFO) << "Execute task: " << TaskType_Name(task_type); + RAY_LOG(INFO) << "TaskExecutor::ExecuteTask"; RAY_CHECK(ray_function.GetLanguage() == Language::CPP); auto function_descriptor = ray_function.GetFunctionDescriptor(); RAY_CHECK(function_descriptor->Type() == diff --git a/cpp/src/ray/test/cluster/cluster_mode_test.cc b/cpp/src/ray/test/cluster/cluster_mode_test.cc index e00c6af14958..780fb0d3024c 100644 --- a/cpp/src/ray/test/cluster/cluster_mode_test.cc +++ b/cpp/src/ray/test/cluster/cluster_mode_test.cc @@ -2,6 +2,7 @@ #include #include #include +#include using namespace ::ray::api; @@ -32,16 +33,11 @@ class Counter { } }; -std::string lib_name = ""; - -std::string redis_ip = ""; - TEST(RayClusterModeTest, FullTest) { /// initialization to cluster mode ray::api::RayConfig::GetInstance()->run_mode = 
RunMode::CLUSTER; /// TODO(Guyang Song): add the dynamic library name - ray::api::RayConfig::GetInstance()->lib_name = lib_name; - ray::api::RayConfig::GetInstance()->redis_ip = redis_ip; + ray::api::RayConfig::GetInstance()->lib_name = ""; Ray::Init(); /// put and get object @@ -148,11 +144,18 @@ TEST(RayClusterModeTest, FullTest) { Ray::Shutdown(); } +/// TODO(Guyang Song): Separate default worker from this test. +/// Currently, we compile `default_worker` and `cluster_mode_test` in one single binary, +/// to work around a symbol conflicting issue. +/// This is the main function of the binary, and we use the `is_default_worker` arg to +/// tell if this binary is used as `default_worker` or `cluster_mode_test`. int main(int argc, char **argv) { - RAY_CHECK(argc == 2 || argc == 3); - lib_name = std::string(argv[1]); - if (argc == 3) { - redis_ip = std::string(argv[2]); + const char *default_worker_magic = "is_default_worker"; + /// `is_default_worker` is the last arg of `argv` + if (argc > 1 && + memcmp(argv[argc - 1], default_worker_magic, strlen(default_worker_magic)) == 0) { + default_worker_main(argc, argv); + return 0; } ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/cpp/src/ray/util/function_helper.cc b/cpp/src/ray/util/function_helper.cc index 8693ea6b1466..5dfa8a012904 100644 --- a/cpp/src/ray/util/function_helper.cc +++ b/cpp/src/ray/util/function_helper.cc @@ -14,14 +14,19 @@ uintptr_t base_addr = 0; static const uintptr_t BaseAddressForHandle(void *handle) { /// TODO(Guyang Song): Implement a cross-platform function. - return (uintptr_t)((NULL == handle) ? NULL : (void *)*(size_t const *)(handle)); + /// Not Implemented. + return -1; } uintptr_t FunctionHelper::LoadLibrary(std::string lib_name) { + if (dynamic_library_base_addr != 0) { + /// Base address has been generated. + return dynamic_library_base_addr; + } /// Generate base address from library. 
RAY_LOG(INFO) << "Start load library " << lib_name; - void *handle = dlopen(lib_name.c_str(), RTLD_LAZY); - uintptr_t base_addr = BaseAddressForHandle(handle); + void *example = dlopen(lib_name.c_str(), RTLD_LAZY); + uintptr_t base_addr = BaseAddressForHandle(example); RAY_CHECK(base_addr > 0); RAY_LOG(INFO) << "Loaded library " << lib_name << " to base address " << base_addr; loaded_library_.emplace(lib_name, base_addr); diff --git a/cpp/src/ray/worker/default_worker.cc b/cpp/src/ray/worker/default_worker.cc index dd61bb457bed..2ebfb8d6ca9c 100644 --- a/cpp/src/ray/worker/default_worker.cc +++ b/cpp/src/ray/worker/default_worker.cc @@ -3,11 +3,14 @@ #include #include -using namespace ::ray::api; +using namespace ::ray; + +namespace ray { +namespace api { int default_worker_main(int argc, char **argv) { RAY_LOG(INFO) << "CPP default worker started"; - RAY_CHECK(argc == 7); + RAY_CHECK(argc == 8); auto config = ray::api::RayConfig::GetInstance(); config->run_mode = RunMode::CLUSTER; @@ -16,7 +19,10 @@ int default_worker_main(int argc, char **argv) { config->raylet_socket = std::string(argv[2]); config->node_manager_port = std::stoi(std::string(argv[3])); std::string redis_address = std::string(std::string(argv[4])); - config->SetRedisAddress(redis_address); + auto pos = redis_address.find(':'); + RAY_CHECK(pos != std::string::npos); + config->redis_ip = redis_address.substr(0, pos); + config->redis_port = std::stoi(redis_address.substr(pos + 1, redis_address.length())); config->redis_password = std::string(std::string(argv[5])); config->session_dir = std::string(std::string(argv[6])); @@ -26,7 +32,5 @@ int default_worker_main(int argc, char **argv) { return 0; } -int main(int argc, char **argv) { - default_worker_main(argc, argv); - return 0; -} +} // namespace api +} // namespace ray diff --git a/doc/source/index.rst b/doc/source/index.rst index be01da3cf2a8..9edb823b20ad 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -117,17 +117,14 @@ Ray 
provides Python, Java, and *EXPERIMENTAL* C++ API. And Ray uses Tasks (funct | The C++ Ray API is currently experimental with limited support. You can track its development `here `__ and report issues on GitHub. | Run the following commands to get started: | - Build ray from source with *bazel* as shown `here `__. - | - Modify `cpp/example/example.cc`. - | - Run `"bazel build //cpp:example"`. - | Option 1:, run the example directly with a dynamic library path. It will start a Ray cluster automatically. + | - Run `"cd ray/cpp"`. + | - Run `"cp dev_BUILD.bazel BUILD.bazel"`. + | - Modify `src/ray/example/example.cc`. | - Run `"ray stop"`. - | - Run `"./bazel-bin/cpp/example/example --dynamic-library-path=bazel-bin/cpp/example/example.so"` - | Option 2: connect to an existing Ray cluster with a known redis address (e.g. `127.0.0.1:6379`). - | - Run `"ray stop"`. - | - Run `"ray start --head --port 6379 --redis-password 5241590000000000 --node-manager-port 62665"`. - | - Run `"./bazel-bin/cpp/example/example --dynamic-library-path=bazel-bin/cpp/example/example.so --redis-address=127.0.0.1:6379"`. + | - Run `"bazel build //cpp:all"`. + | - Run `"bazel run //cpp:example"`. - .. literalinclude:: ../../cpp/example/example.cc + .. literalinclude:: ../../cpp/src/ray/example/example.cc :language: cpp You can also get started by visiting our `Tutorials `_. For the latest wheels (nightlies), see the `installation page `__. diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 996cede111d6..1c4c6497dca6 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1580,9 +1580,13 @@ def build_cpp_worker_command( The command string for starting CPP worker. """ + # TODO(Guyang Song): Remove the arg is_default_worker. + # See `cluster_mode_test.cc` for why this workaround is currently needed + # for C++ workers. 
command = [ DEFAULT_WORKER_EXECUTABLE, plasma_store_name, raylet_name, - str(node_manager_port), redis_address, redis_password, session_dir + str(node_manager_port), redis_address, redis_password, session_dir, + "is_default_worker" ] return command From d65b6fb44e1753fee4f4457b3ceb8fba1296390d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 131/244] Revert "Use right reserve size (#13829)" This reverts commit 95e3b0885ca16640c2436a09c311a5cf9c7dc0b0. --- src/ray/gcs/store_client/redis_store_client.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/gcs/store_client/redis_store_client.cc b/src/ray/gcs/store_client/redis_store_client.cc index 4db20698861d..b104be3adbf4 100644 --- a/src/ray/gcs/store_client/redis_store_client.cc +++ b/src/ray/gcs/store_client/redis_store_client.cc @@ -115,7 +115,7 @@ Status RedisStoreClient::AsyncDeleteWithIndex(const std::string &table_name, const std::string &index_key, const StatusCallback &callback) { std::vector redis_keys; - redis_keys.reserve(2); + redis_keys.reserve(20); redis_keys.push_back(GenRedisKey(table_name, key)); redis_keys.push_back(GenRedisKey(table_name, key, index_key)); From 5c28227d7869f5336c9c9951d07ee161e3341b17 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 132/244] Revert "[autoscaler] Remove min workers from multi node type examples (#13814)" This reverts commit c4308de25a5ac8289f22dbfb68c4d00a0e122004. 
--- python/ray/autoscaler/_private/util.py | 2 +- python/ray/autoscaler/aws/example-multi-node-type.yaml | 1 + python/ray/autoscaler/staroid/example-multi-node-type.yaml | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 39ebd5e799fe..32758dec649f 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -124,7 +124,7 @@ def rewrite_legacy_yaml_to_available_node_types( }, } config["head_node_type"] = NODE_TYPE_LEGACY_HEAD - del config["min_workers"] + return config diff --git a/python/ray/autoscaler/aws/example-multi-node-type.yaml b/python/ray/autoscaler/aws/example-multi-node-type.yaml index 19584c69df2d..1a83b8cc6212 100644 --- a/python/ray/autoscaler/aws/example-multi-node-type.yaml +++ b/python/ray/autoscaler/aws/example-multi-node-type.yaml @@ -1,5 +1,6 @@ # Experimental: an example of configuring a mixed-node-type cluster. cluster_name: multi_node_type +min_workers: 1 max_workers: 40 # The autoscaler will scale up the cluster faster with higher upscaling speed. diff --git a/python/ray/autoscaler/staroid/example-multi-node-type.yaml b/python/ray/autoscaler/staroid/example-multi-node-type.yaml index f0291963ec3c..563e3a74c6e4 100644 --- a/python/ray/autoscaler/staroid/example-multi-node-type.yaml +++ b/python/ray/autoscaler/staroid/example-multi-node-type.yaml @@ -1,5 +1,6 @@ # an example of configuring a mixed-node-type cluster. cluster_name: multi-node-type # name with 'a-z' and '-' +min_workers: 1 max_workers: 40 # The autoscaler will scale up the cluster faster with higher upscaling speed. From 97cfc965e0f3f6af85efd2a8019433ea21638710 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 133/244] Revert "[Object Spilling] Turn on by default. (#13745)" This reverts commit 0cdd4ab475fa79ed701e11aef2805a630ae7c4a8. 
--- python/ray/node.py | 46 ------------ python/ray/parameter.py | 14 ++++ python/ray/tests/test_basic.py | 3 - python/ray/tests/test_failure.py | 5 +- python/ray/tests/test_object_spilling.py | 73 +------------------ python/ray/tests/test_reference_counting.py | 2 - python/ray/tests/test_reference_counting_2.py | 1 - 7 files changed, 18 insertions(+), 126 deletions(-) diff --git a/python/ray/node.py b/python/ray/node.py index a63a0a8a8996..2668d9aa0735 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -167,11 +167,6 @@ def __init__(self, self._init_temp(redis_client) - # If it is a head node, try validating if - # external storage is configurable. - if head: - self.validate_external_storage() - if connect_only: # Get socket names from the configuration. self._plasma_store_socket_name = ( @@ -1169,44 +1164,3 @@ def destroy_external_storage(self): storage = external_storage.setup_external_storage( object_spilling_config) storage.destroy_external_storage() - - def validate_external_storage(self): - """Make sure we can setup the object spilling external storage. - This will also fill up the default setting for object spilling - if not specified. - """ - object_spilling_config = self._config.get("object_spilling_config", {}) - automatic_spilling_enabled = self._config.get( - "automatic_object_spilling_enabled", True) - if not automatic_spilling_enabled: - return - - # If the config is not specified, we fill up the default. - if not object_spilling_config: - object_spilling_config = json.dumps({ - "type": "filesystem", - "params": { - "directory_path": self._session_dir - } - }) - - # Try setting up the storage. - # Configure the proper system config. - # We need to set both ray param's system config and self._config - # because they could've been diverged at this point. 
- deserialized_config = json.loads(object_spilling_config) - self._ray_params._system_config["object_spilling_config"] = ( - object_spilling_config) - self._config["object_spilling_config"] = object_spilling_config - - is_external_storage_type_fs = ( - deserialized_config["type"] == "filesystem") - self._ray_params._system_config["is_external_storage_type_fs"] = ( - is_external_storage_type_fs) - self._config["is_external_storage_type_fs"] = ( - is_external_storage_type_fs) - - # Validate external storage usage. - from ray import external_storage - external_storage.setup_external_storage(deserialized_config) - external_storage.reset_external_storage() diff --git a/python/ray/parameter.py b/python/ray/parameter.py index af7bdf47593d..666b82905b1e 100644 --- a/python/ray/parameter.py +++ b/python/ray/parameter.py @@ -1,3 +1,4 @@ +import json import logging import os @@ -319,3 +320,16 @@ def _check_usage(self): if numpy_major <= 1 and numpy_minor < 16: logger.warning("Using ray with numpy < 1.16.0 will result in slow " "serialization. Upgrade numpy if using with ray.") + + # Make sure object spilling configuration is applicable. + object_spilling_config = self._system_config.get( + "object_spilling_config", {}) + if object_spilling_config: + object_spilling_config = json.loads(object_spilling_config) + from ray import external_storage + # Validate external storage usage. + external_storage.setup_external_storage(object_spilling_config) + external_storage.reset_external_storage() + # Configure the proper system config. 
+ self._system_config["is_external_storage_type_fs"] = ( + object_spilling_config["type"] == "filesystem") diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 4c80aea70ebb..e33af42deb46 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -261,9 +261,6 @@ def foo(): "ray_start_cluster_head", [{ "num_cpus": 0, "object_store_memory": 75 * 1024 * 1024, - "_system_config": { - "automatic_object_spilling_enabled": False - } }], indirect=True) def test_fetch_local(ray_start_cluster_head): diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index abd82011d1e4..f45aea9b4292 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -1039,10 +1039,7 @@ def some_expensive_task(self): def test_fill_object_store_exception(shutdown_only): - ray.init( - num_cpus=2, - object_store_memory=10**8, - _system_config={"automatic_object_spilling_enabled": False}) + ray.init(num_cpus=2, object_store_memory=10**8) @ray.remote def expensive_task(): diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 159e0aaf79b1..242799dc9281 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -69,14 +69,6 @@ def multi_node_object_spilling_config(request, tmp_path): yield create_object_spilling_config(request, tmp_path) -def run_basic_workload(): - """Run the workload that requires spilling.""" - arr = np.random.rand(5 * 1024 * 1024) # 40 MB - refs = [] - refs.append([ray.put(arr) for _ in range(2)]) - ray.get(ray.put(arr)) - - def is_dir_empty(temp_folder, append_path=ray.ray_constants.DEFAULT_OBJECT_PREFIX): # append_path is used because the file based spilling will append @@ -119,68 +111,6 @@ def test_url_generation_and_parse(): assert parsed_result.size == size -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -def 
test_default_config(shutdown_only): - ray.init(num_cpus=0, object_store_memory=75 * 1024 * 1024) - # Make sure the object spilling configuration is properly set. - config = json.loads( - ray.worker._global_node._config["object_spilling_config"]) - assert config["type"] == "filesystem" - assert (config["params"]["directory_path"] == - ray.worker._global_node._session_dir) - # Make sure the basic workload can succeed. - run_basic_workload() - ray.shutdown() - - # Make sure config is not initalized if spilling is not enabled.. - ray.init( - num_cpus=0, - object_store_memory=75 * 1024 * 1024, - _system_config={ - "automatic_object_spilling_enabled": False, - "object_store_full_delay_ms": 100 - }) - assert "object_spilling_config" not in ray.worker._global_node._config - with pytest.raises(ray.exceptions.ObjectStoreFullError): - run_basic_workload() - ray.shutdown() - - # Make sure when we use a different config, it is reflected. - ray.init( - num_cpus=0, - _system_config={ - "object_spilling_config": ( - json.dumps(mock_distributed_fs_object_spilling_config)) - }) - config = json.loads( - ray.worker._global_node._config["object_spilling_config"]) - assert config["type"] == "mock_distributed_fs" - - -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -def test_default_config_cluster(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=0) - ray.init(cluster.address) - worker_nodes = [] - worker_nodes.append( - cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)) - cluster.wait_for_nodes() - - # Run the basic spilling workload on both - # worker nodes and make sure they are working. 
- @ray.remote - def task(): - arr = np.random.rand(5 * 1024 * 1024) # 40 MB - refs = [] - refs.append([ray.put(arr) for _ in range(2)]) - ray.get(ray.put(arr)) - - ray.get([task.remote() for _ in range(2)]) - - @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spilling_not_done_for_pinned_object(object_spilling_config, @@ -760,7 +690,9 @@ def test_file_deleted_when_driver_exits(tmp_path, shutdown_only): import os import signal import numpy as np + import ray + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={{ @@ -777,6 +709,7 @@ def test_file_deleted_when_driver_exits(tmp_path, shutdown_only): }}) arr = np.random.rand(1024 * 1024) # 8 MB data replay_buffer = [] + # Spill lots of objects for _ in range(30): ref = None diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 02638ed3dea8..a47a9a828c11 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -18,10 +18,8 @@ @pytest.fixture def one_worker_100MiB(request): - # It has lots of tests that don't require object spilling. 
config = { "task_retry_delay_ms": 0, - "automatic_object_spilling_enabled": False } yield ray.init( num_cpus=1, diff --git a/python/ray/tests/test_reference_counting_2.py b/python/ray/tests/test_reference_counting_2.py index 416afcec0378..8cc7576aa46c 100644 --- a/python/ray/tests/test_reference_counting_2.py +++ b/python/ray/tests/test_reference_counting_2.py @@ -22,7 +22,6 @@ def one_worker_100MiB(request): config = { "task_retry_delay_ms": 0, "object_timeout_milliseconds": 1000, - "automatic_object_spilling_enabled": False } yield ray.init( num_cpus=1, From a9dffad14f9a29b0d2a21b6e4ed4789732e38574 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 134/244] Revert "[Release] Fix SGD+Tune long running distributed release test (#13812)" This reverts commit 3a7704d90eb212e1234f155a3eb2edac68a479eb. --- python/ray/util/sgd/BUILD | 14 -- .../sgd/torch/examples/pytorch_pbt_failure.py | 128 ---------------- .../workloads/pytorch_pbt_failure.py | 139 +++++++++++++++++- 3 files changed, 138 insertions(+), 143 deletions(-) delete mode 100644 python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py mode change 120000 => 100644 release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py diff --git a/python/ray/util/sgd/BUILD b/python/ray/util/sgd/BUILD index cbdc52cb479a..896560136626 100644 --- a/python/ray/util/sgd/BUILD +++ b/python/ray/util/sgd/BUILD @@ -241,20 +241,6 @@ py_test( args = ["--smoke-test"] ) -# -------------------------------------------------------------------- -# SGD related tests from the ../../../../release directory. -# Please keep these sorted alphabetically. 
-# -------------------------------------------------------------------- - -py_test( - name = "pytorch_pbt_failure", - size = "medium", - srcs = ["torch/examples/pytorch_pbt_failure.py"], - tags = ["exlusive", "pytorch", "release"], - deps = [":sgd_lib"], - args = ["--smoke-test"] -) - # This is a dummy test dependency that causes the above tests to be # re-run if any of these files changes. py_library( diff --git a/python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py b/python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py deleted file mode 100644 index 053991885b4b..000000000000 --- a/python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py +++ /dev/null @@ -1,128 +0,0 @@ -import argparse -import numpy as np -import os -import torch -import torch.nn as nn -from torch.utils.data import DataLoader, Subset -from torchvision.datasets import CIFAR10 -import torchvision.transforms as transforms - -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers import PopulationBasedTraining -from ray.tune.utils.mock import FailureInjectorCallback -from ray.util.sgd.torch import TorchTrainer, TrainingOperator -from ray.util.sgd.torch.resnet import ResNet18 -from ray.util.sgd.utils import BATCH_SIZE - -parser = argparse.ArgumentParser() -parser.add_argument( - "--smoke-test", - action="store_true", - default=False, - help="Finish quickly for training.") -args = parser.parse_args() - - -def initialization_hook(): - # Need this for avoiding a connection restart issue on AWS. 
- os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" - os.environ["NCCL_LL_THRESHOLD"] = "0" - - # set the below if needed - # print("NCCL DEBUG SET") - # os.environ["NCCL_DEBUG"] = "INFO" - - -def cifar_creator(config): - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), - (0.2023, 0.1994, 0.2010)), - ]) # meanstd transformation - - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), - (0.2023, 0.1994, 0.2010)), - ]) - train_dataset = CIFAR10( - root="~/data", train=True, download=True, transform=transform_train) - validation_dataset = CIFAR10( - root="~/data", train=False, download=False, transform=transform_test) - - if config.get("test_mode"): - train_dataset = Subset(train_dataset, list(range(64))) - validation_dataset = Subset(validation_dataset, list(range(64))) - - train_loader = DataLoader( - train_dataset, batch_size=config[BATCH_SIZE], num_workers=2) - validation_loader = DataLoader( - validation_dataset, batch_size=config[BATCH_SIZE], num_workers=2) - return train_loader, validation_loader - - -def optimizer_creator(model, config): - """Returns optimizer""" - return torch.optim.SGD( - model.parameters(), - lr=config.get("lr", 0.1), - momentum=config.get("momentum", 0.9)) - - -ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) -num_training_workers = 1 if args.smoke_test else 3 - -CustomTrainingOperator = TrainingOperator.from_creators( - model_creator=ResNet18, - optimizer_creator=optimizer_creator, - data_creator=cifar_creator, - loss_creator=nn.CrossEntropyLoss) - -TorchTrainable = TorchTrainer.as_trainable( - training_operator_cls=CustomTrainingOperator, - initialization_hook=initialization_hook, - num_workers=num_training_workers, - config={ - "test_mode": args.smoke_test, - BATCH_SIZE: 128 * num_training_workers, - }, - 
use_gpu=not args.smoke_test) - -pbt_scheduler = PopulationBasedTraining( - time_attr="training_iteration", - metric="val_loss", - mode="min", - perturbation_interval=1, - hyperparam_mutations={ - # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], - }) - -reporter = CLIReporter() -reporter.add_metric_column("val_loss", "loss") -reporter.add_metric_column("val_accuracy", "acc") - -analysis = tune.run( - TorchTrainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "head_location": None, - "worker_locations": None - }, - max_failures=-1, # used for fault tolerance - checkpoint_freq=2, # used for fault tolerance - progress_reporter=reporter, - scheduler=pbt_scheduler, - callbacks=[FailureInjectorCallback()], - queue_trials=True, - stop={"training_iteration": 1} if args.smoke_test else None) - -print(analysis.get_best_config(metric="val_loss", mode="min")) diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py deleted file mode 120000 index 4bc3925a1e83..000000000000 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ /dev/null @@ -1 +0,0 @@ -../../../python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py \ No newline at end of file diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py new file mode 100644 index 000000000000..2451fe4a2228 --- /dev/null +++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -0,0 +1,138 @@ +import argparse +import numpy as np +import os +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Subset +from torchvision.datasets import CIFAR10 +import torchvision.transforms as transforms + +import 
ray +from ray import tune +from ray.tune import CLIReporter +from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.utils.util import merge_dicts +from ray.tune.utils.mock import FailureInjectorCallback +from ray.util.sgd.torch import TorchTrainer, TrainingOperator +from ray.util.sgd.torch.resnet import ResNet18 +from ray.util.sgd.utils import BATCH_SIZE + +parser = argparse.ArgumentParser() +parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for training.") +args = parser.parse_args() + + +def initialization_hook(): + # Need this for avoiding a connection restart issue on AWS. + os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" + os.environ["NCCL_LL_THRESHOLD"] = "0" + + # set the below if needed + # print("NCCL DEBUG SET") + # os.environ["NCCL_DEBUG"] = "INFO" + + +def cifar_creator(config): + transform_train = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), + (0.2023, 0.1994, 0.2010)), + ]) # meanstd transformation + + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), + (0.2023, 0.1994, 0.2010)), + ]) + train_dataset = CIFAR10( + root="~/data", train=True, download=True, transform=transform_train) + validation_dataset = CIFAR10( + root="~/data", train=False, download=False, transform=transform_test) + + if config.get("test_mode"): + train_dataset = Subset(train_dataset, list(range(64))) + validation_dataset = Subset(validation_dataset, list(range(64))) + + train_loader = DataLoader( + train_dataset, batch_size=config[BATCH_SIZE], num_workers=2) + validation_loader = DataLoader( + validation_dataset, batch_size=config[BATCH_SIZE], num_workers=2) + return train_loader, validation_loader + + +def optimizer_creator(model, config): + """Returns optimizer""" + return torch.optim.SGD( + model.parameters(), + 
lr=config.get("lr", 0.1), + momentum=config.get("momentum", 0.9)) + + +ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) +num_training_workers = 1 if args.smoke_test else 3 + +CustomTrainingOperator = TrainingOperator.from_creators( + model_creator=ResNet18, + optimizer_creator=optimizer_creator, + data_creator=cifar_creator, + loss_creator=nn.CrossEntropyLoss) + +TorchTrainable = TorchTrainer.as_trainable( + training_operator_cls=CustomTrainingOperator, + initialization_hook=initialization_hook, + num_workers=num_training_workers, + config={ + "test_mode": args.smoke_test, + BATCH_SIZE: 128 * num_training_workers, + }, + use_gpu=not args.smoke_test) + + +class NoFaultToleranceTrainable(TorchTrainable): + def _train(self): + train_stats = self.trainer.train(max_retries=0, profile=True) + validation_stats = self.trainer.validate(profile=True) + stats = merge_dicts(train_stats, validation_stats) + return stats + + +pbt_scheduler = PopulationBasedTraining( + time_attr="training_iteration", + metric="val_loss", + mode="min", + perturbation_interval=1, + hyperparam_mutations={ + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + }) + +reporter = CLIReporter() +reporter.add_metric_column("val_loss", "loss") +reporter.add_metric_column("val_accuracy", "acc") + +analysis = tune.run( + NoFaultToleranceTrainable, + num_samples=4, + config={ + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "head_location": None, + "worker_locations": None + }, + max_failures=-1, # used for fault tolerance + checkpoint_freq=2, # used for fault tolerance + progress_reporter=reporter, + scheduler=pbt_scheduler, + callbacks=[FailureInjectorCallback()], + queue_trials=True, + stop={"training_iteration": 1} if args.smoke_test else None) + +print(analysis.get_best_config(metric="val_loss", mode="min")) From 
7132529aecc3d1efae5644b9a32f7ce540a2164f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 135/244] Revert "[Log] use default stderr logger if no raylog starting (#13762)" This reverts commit 565baf8b3d3e2c9174092eb7aecbb2594ef05bfb. --- src/ray/util/logging.cc | 56 ++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index 104fff0ec317..b06d64441087 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -55,17 +55,6 @@ namespace ray { -RayLogLevel RayLog::severity_threshold_ = RayLogLevel::INFO; -std::string RayLog::app_name_ = ""; -std::string RayLog::log_dir_ = ""; -// Format pattern is 2020-08-21 17:00:00,000 I 100 1001 msg. -// %L is loglevel, %P is process id, %t for thread id. -std::string RayLog::log_format_pattern_ = "[%Y-%m-%d %H:%M:%S,%e %L %P %t] %v"; -std::string RayLog::logger_name_ = "ray_log_sink"; -long RayLog::log_rotation_max_size_ = 1 << 29; -long RayLog::log_rotation_file_num_ = 10; -bool RayLog::is_failure_signal_handler_installed_ = false; - std::string GetCallTrace() { std::string return_message = "Cannot get callstack information."; #if defined(RAY_USE_GLOG) || defined(RAY_USE_SPDLOG) @@ -102,34 +91,23 @@ inline const char *ConstBasename(const char *filepath) { return base ? (base + 1) : filepath; } -/// A logger that prints logs to stderr. -/// This is the default logger if logging is not initialized. 
-class DefaultStdErrLogger final { - public: - DefaultStdErrLogger() { - default_stderr_logger_ = spdlog::stderr_color_mt("stderr"); - default_stderr_logger_->set_pattern(RayLog::GetLogFormatPattern()); - } - std::shared_ptr GetDefaultLogger() { return default_stderr_logger_; } - - private: - std::shared_ptr default_stderr_logger_; -}; - -/// NOTE(lingxuan.zlx): Default stderr logger must be singleton and global -/// variable so core worker process can invoke `RAY_LOG` in its whole lifecyle. -std::unique_ptr default_stderr_logger(new DefaultStdErrLogger()); - class SpdLogMessage final { public: explicit SpdLogMessage(const char *file, int line, int loglevel) : loglevel_(loglevel) { stream() << ConstBasename(file) << ":" << line << ": "; } + inline std::shared_ptr GetDefaultLogger() { + // We just emit all log informations to stderr when no default logger has been created + // before starting ray log, which is for glog compatible. + static auto logger = spdlog::stderr_color_mt("stderr"); + logger->set_pattern(RayLog::GetLogFormatPattern()); + return logger; + } inline void Flush() { auto logger = spdlog::get(RayLog::GetLoggerName()); if (!logger) { - logger = default_stderr_logger->GetDefaultLogger(); + logger = GetDefaultLogger(); } // To avoid dump duplicated stacktrace with installed failure signal // handler, we have to check whether glog failure signal handler is enabled. 
@@ -150,13 +128,12 @@ class SpdLogMessage final { ~SpdLogMessage() { Flush(); } inline std::ostream &stream() { return str_; } - private: - SpdLogMessage(const SpdLogMessage &) = delete; - SpdLogMessage &operator=(const SpdLogMessage &) = delete; - private: std::ostringstream str_; int loglevel_; + + SpdLogMessage(const SpdLogMessage &) = delete; + SpdLogMessage &operator=(const SpdLogMessage &) = delete; }; #endif @@ -211,6 +188,17 @@ typedef ray::SpdLogMessage LoggingProvider; typedef ray::CerrLog LoggingProvider; #endif +RayLogLevel RayLog::severity_threshold_ = RayLogLevel::INFO; +std::string RayLog::app_name_ = ""; +std::string RayLog::log_dir_ = ""; +// Format pattern is 2020-08-21 17:00:00,000 I 100 1001 msg. +// %L is loglevel, %P is process id, %t for thread id. +std::string RayLog::log_format_pattern_ = "[%Y-%m-%d %H:%M:%S,%e %L %P %t] %v"; +std::string RayLog::logger_name_ = "ray_log_sink"; +long RayLog::log_rotation_max_size_ = 1 << 29; +long RayLog::log_rotation_file_num_ = 10; +bool RayLog::is_failure_signal_handler_installed_ = false; + #ifdef RAY_USE_GLOG using namespace google; From 41b9970be025419c2bfeeb3aa6e2c47fe92b4092 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 136/244] Revert "Fix windows test (#13811)" This reverts commit 33c190b2f15370c7362f42ec907a84fda319d61f. 
--- python/ray/autoscaler/_private/util.py | 8 ------- python/ray/tests/test_autoscaler_yaml.py | 30 ++---------------------- 2 files changed, 2 insertions(+), 36 deletions(-) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 32758dec649f..2bd1e13e9c38 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -86,14 +86,6 @@ def validate_config(config: Dict[str, Any]) -> None: raise ValueError( "`head_node_type` must be one of `available_node_types`.") - sum_min_workers = sum( - config["available_node_types"][node_type].get("min_workers", 0) - for node_type in config["available_node_types"]) - if sum_min_workers > config["max_workers"]: - raise ValueError( - "The specified global `max_workers` is smaller than the " - "sum of `min_workers` of all the available node types.") - def prepare_config(config): with_defaults = fillout_defaults(config) diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py index 10edbb8fe7e0..b712c8955e97 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -46,34 +46,8 @@ def testValidateDefaultConfig(self): self.fail("Config did not pass validation test!") @pytest.mark.skipif( - sys.platform.startswith("win"), reason="Fails on Windows.") - def testValidateDefaultConfigMinMaxWorkers(self): - aws_config_path = os.path.join( - RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml") - with open(aws_config_path) as f: - config = yaml.safe_load(f) - config = prepare_config(config) - for node_type in config["available_node_types"]: - config["available_node_types"][node_type]["resources"] = config[ - "available_node_types"][node_type].get("resources", {}) - try: - validate_config(config) - except Exception: - self.fail("Config did not pass validation test!") - - config["max_workers"] = 0 # the sum of min_workers is 1. 
- with pytest.raises(ValueError): - validate_config(config) - - # make sure edge case of exactly 1 passes too. - config["max_workers"] = 1 - try: - validate_config(config) - except Exception: - self.fail("Config did not pass validation test!") - - @pytest.mark.skipif( - sys.platform.startswith("win"), reason="Fails on Windows.") + sys.platform.startswith("win"), + reason="TODO(ameer): fails on Windows.") def testValidateDefaultConfigAWSMultiNodeTypes(self): aws_config_path = os.path.join( RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml") From b2f6174ddeda4feb5cfbee8364bf587287c225e3 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 137/244] Revert "[Dashboard] fix new dashboard entrance and some table problem (#13790)" This reverts commit b15dcf4bb21685ee2adb311436f6d9b3d51c4f05. --- .../client/src/pages/dashboard/Dashboard.tsx | 11 +---- dashboard/client/src/pages/job/index.tsx | 3 ++ dashboard/client/src/pages/layout/index.tsx | 12 +++++- dashboard/client/src/pages/node/index.tsx | 43 ++++++++++++++++++- 4 files changed, 58 insertions(+), 11 deletions(-) diff --git a/dashboard/client/src/pages/dashboard/Dashboard.tsx b/dashboard/client/src/pages/dashboard/Dashboard.tsx index 07f266961451..d7eeaf936b45 100644 --- a/dashboard/client/src/pages/dashboard/Dashboard.tsx +++ b/dashboard/client/src/pages/dashboard/Dashboard.tsx @@ -35,7 +35,6 @@ const useDashboardStyles = makeStyles((theme: Theme) => "& > :not(:first-child)": { marginTop: theme.spacing(4), }, - position: "relative", }, tabs: { borderBottomColor: theme.palette.divider, @@ -107,14 +106,8 @@ const Dashboard: React.FC = () => { return (
Ray Dashboard - { driverIpAddress, isDead, driverPid, + state, timestamp, + namespaceId, }) => ( @@ -112,6 +114,7 @@ const JobList = () => { {dayjs(timestamp * 1000).format("YYYY/MM/DD HH:mm:ss")} + {namespaceId} ), )} diff --git a/dashboard/client/src/pages/layout/index.tsx b/dashboard/client/src/pages/layout/index.tsx index bcaffafce6ec..b484a29db646 100644 --- a/dashboard/client/src/pages/layout/index.tsx +++ b/dashboard/client/src/pages/layout/index.tsx @@ -77,6 +77,16 @@ const BasicLayout = ( Ray
Ray Dashboard + history.push("/summary")} + > + SUMMARY + history.push("/")} > - BACK TO EXISTING DASHBOARD + BACK TO LEGACY { } const { raylet, hostname, ip, cpu, mem, net, disk, logUrl } = node; - const { nodeId, state } = raylet; + const { nodeId, state, brpcPort } = raylet; return ( @@ -123,6 +126,15 @@ export const NodeCard = (props: { node: NodeDetail }) => { )} + + + - -
- )}{" "} - - - - {[ - "", - "Pid", - "CPU", - "CPU Times", - "Memory", - "CMD Line", - "Create Time", - "Log", - "Ops", - "IP/Hostname", - ].map((col) => ( - - {col} - - ))} - - - - {workers - .filter(filterFunc) - .sort((aWorker, bWorker) => { - const a = - (aWorker.coreWorkerStats || []).filter( - (e) => actorMap[e.actorId], - ).length || 0; - const b = - (bWorker.coreWorkerStats || []).filter( - (e) => actorMap[e.actorId], - ).length || 0; - return b - a; - }) - .map( - ({ - pid, - cpuPercent, - cpuTimes, - memoryInfo, - cmdline, - createTime, - coreWorkerStats = [], - language, - ip, - hostname, - }) => ( - - } - length={ - (coreWorkerStats || []).filter((e) => actorMap[e.actorId]) - .length - } - key={pid} - stateKey={key} - > - {pid} - - - {cpuPercent}% - - - -
- {Object.entries(cpuTimes || {}).map(([key, val]) => ( -
- {key}:{val} -
- ))} -
-
- -
- {Object.entries(memoryInfo || {}).map(([key, val]) => ( -
- {key}:{memoryConverter(val)} -
- ))} -
-
- - {cmdline && longTextCut(cmdline.filter((e) => e).join(" "))} - - - {dayjs(createTime * 1000).format("YYYY/MM/DD HH:mm:ss")} - - - - {ipLogMap[ip] && ( - - - Log - - - )} - - - - {language === "JAVA" && ( -
- {" "} - - -
- )} -
- - {ip} -
- {nodeMap[hostname] ? ( - - {hostname} - - ) : ( - hostname - )} -
-
- ), - )} -
-
- - ); -}; - -export default RayletWorkerTable; diff --git a/dashboard/client/src/logo.svg b/dashboard/client/src/logo.svg deleted file mode 100644 index 70be9ee548c6..000000000000 --- a/dashboard/client/src/logo.svg +++ /dev/null @@ -1,34 +0,0 @@ - - - - -Ray Logo - - - - - - - - - - diff --git a/dashboard/client/src/pages/actor/index.tsx b/dashboard/client/src/pages/actor/index.tsx deleted file mode 100644 index cbcd264e26af..000000000000 --- a/dashboard/client/src/pages/actor/index.tsx +++ /dev/null @@ -1,36 +0,0 @@ -import { makeStyles } from "@material-ui/core"; -import React, { useEffect, useState } from "react"; -import ActorTable from "../../components/ActorTable"; -import TitleCard from "../../components/TitleCard"; -import { getActors } from "../../service/actor"; -import { Actor } from "../../type/actor"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - }, -})); - -const Actors = () => { - const classes = useStyles(); - const [actors, setActors] = useState<{ [actorId: string]: Actor }>({}); - - useEffect(() => { - getActors().then((res) => { - if (res?.data?.data?.actors) { - setActors(res.data.data.actors); - } - }); - }, []); - - return ( -
- - - -
- ); -}; - -export default Actors; diff --git a/dashboard/client/src/pages/cmd/CMDResult.tsx b/dashboard/client/src/pages/cmd/CMDResult.tsx deleted file mode 100644 index ed87c10d8e7c..000000000000 --- a/dashboard/client/src/pages/cmd/CMDResult.tsx +++ /dev/null @@ -1,137 +0,0 @@ -import { - Button, - Grid, - makeStyles, - MenuItem, - Paper, - Select, -} from "@material-ui/core"; -import React, { useCallback, useEffect, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import LogVirtualView from "../../components/LogView/LogVirtualView"; -import TitleCard from "../../components/TitleCard"; -import { getJmap, getJstack, getJstat } from "../../service/util"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(4), - width: "100%", - }, - table: { - marginTop: theme.spacing(4), - padding: theme.spacing(2), - }, - pageMeta: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - }, - search: { - margin: theme.spacing(1), - }, -})); - -const CMDResult = ( - props: RouteComponentProps<{ cmd: string; ip: string; pid: string }>, -) => { - const classes = useStyles(); - const { - match: { params }, - } = props; - const { cmd, ip, pid } = params; - const [result, setResult] = useState(); - const [option, setOption] = useState("gcutil"); - const executeJstat = useCallback( - () => - getJstat(ip, pid, option) - .then((rsp) => { - if (rsp.data.result) { - setResult(rsp.data.data.output); - } else { - setResult(rsp.data.msg); - } - }) - .catch((err) => setResult(err.toString())), - [ip, pid, option], - ); - - useEffect(() => { - switch (cmd) { - case "jstack": - getJstack(ip, pid) - .then((rsp) => { - if (rsp.data.result) { - setResult(rsp.data.data.output); - } else { - setResult(rsp.data.msg); - } - }) - .catch((err) => setResult(err.toString())); - break; - case "jmap": - getJmap(ip, pid) - .then((rsp) => { - if (rsp.data.result) { - setResult(rsp.data.data.output); - } else { - setResult(rsp.data.msg); - } 
- }) - .catch((err) => setResult(err.toString())); - break; - case "jstat": - executeJstat(); - break; - default: - setResult(`Command ${cmd} is not supported.`); - break; - } - }, [cmd, executeJstat, ip, pid]); - - return ( -
- - {cmd === "jstat" && ( - - - - - - - - - - - )} - - - - -
- ); -}; - -export default CMDResult; diff --git a/dashboard/client/src/pages/dashboard/Dashboard.tsx b/dashboard/client/src/pages/dashboard/Dashboard.tsx index d7eeaf936b45..0ffbce7f5d5f 100644 --- a/dashboard/client/src/pages/dashboard/Dashboard.tsx +++ b/dashboard/client/src/pages/dashboard/Dashboard.tsx @@ -1,5 +1,4 @@ import { - Button, createStyles, makeStyles, Tab, @@ -9,7 +8,6 @@ import { } from "@material-ui/core"; import React, { useCallback, useEffect, useRef } from "react"; import { useDispatch, useSelector } from "react-redux"; -import { useHistory } from "react-router-dom"; import { getActorGroups, getNodeInfo, getTuneAvailability } from "../../api"; import { StoreState } from "../../store"; import LastUpdated from "./LastUpdated"; @@ -61,7 +59,6 @@ const Dashboard: React.FC = () => { const tuneAvailability = useSelector(tuneAvailabilitySelector); const tab = useSelector(tabSelector); const classes = useDashboardStyles(); - const history = useHistory(); // Polling Function const refreshInfo = useCallback(async () => { @@ -106,9 +103,6 @@ const Dashboard: React.FC = () => { return (
Ray Dashboard - { - return ( -
-
- - - - 404 NOT FOUND -

- We can't provide the page you wanted yet, better try with another path - next time. -

-
-
- ); -}; - -export default Error404; diff --git a/dashboard/client/src/pages/exception/Loading.tsx b/dashboard/client/src/pages/exception/Loading.tsx deleted file mode 100644 index 24140c4dc0de..000000000000 --- a/dashboard/client/src/pages/exception/Loading.tsx +++ /dev/null @@ -1,21 +0,0 @@ -import React from "react"; -import Logo from "../../logo.svg"; - -export default () => { - return ( -
-
- Loading -
- Loading... -
-
- ); -}; diff --git a/dashboard/client/src/pages/index/Index.tsx b/dashboard/client/src/pages/index/Index.tsx deleted file mode 100644 index 9612164499f4..000000000000 --- a/dashboard/client/src/pages/index/Index.tsx +++ /dev/null @@ -1,110 +0,0 @@ -import { - makeStyles, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, -} from "@material-ui/core"; -import React, { useEffect, useState } from "react"; -import { version } from "../../../package.json"; -import TitleCard from "../../components/TitleCard"; -import { getRayConfig } from "../../service/cluster"; -import { getNodeList } from "../../service/node"; -import { RayConfig } from "../../type/config"; -import { NodeDetail } from "../../type/node"; -import { memoryConverter } from "../../util/converter"; - -const useStyle = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - }, - label: { - fontWeight: "bold", - }, -})); - -const getVal = (key: string, value: any) => { - if (key === "containerMemory") { - return memoryConverter(value * 1024 * 1024); - } - return JSON.stringify(value); -}; - -const useIndex = () => { - const [rayConfig, setConfig] = useState(); - const [nodes, setNodes] = useState([]); - useEffect(() => { - getRayConfig().then((res) => { - if (res?.data?.data?.config) { - setConfig(res.data.data.config); - } - }); - }, []); - useEffect(() => { - getNodeList().then((res) => { - if (res?.data?.data?.summary) { - setNodes(res.data.data.summary); - } - }); - }, []); - - return { rayConfig, nodes }; -}; - -const Index = () => { - const { rayConfig } = useIndex(); - const classes = useStyle(); - - return ( -
- -

Dashboard Frontend Version: {version}

- {rayConfig?.imageUrl && ( -

- Image Url:{" "} - - {rayConfig.imageUrl} - -

- )} - {rayConfig?.sourceCodeLink && ( -

- Source Code:{" "} - - {rayConfig.sourceCodeLink} - -

- )} -
- {rayConfig && ( - - - - Key - Value - - - {Object.entries(rayConfig).map(([key, value]) => ( - - {key} - {getVal(key, value)} - - ))} - - - - )} -
- ); -}; - -export default Index; diff --git a/dashboard/client/src/pages/job/JobDetail.tsx b/dashboard/client/src/pages/job/JobDetail.tsx deleted file mode 100644 index b720b9c057de..000000000000 --- a/dashboard/client/src/pages/job/JobDetail.tsx +++ /dev/null @@ -1,246 +0,0 @@ -import { - Grid, - makeStyles, - Switch, - Tab, - Table, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, - Tabs, -} from "@material-ui/core"; -import React from "react"; -import { Link, RouteComponentProps } from "react-router-dom"; -import ActorTable from "../../components/ActorTable"; -import Loading from "../../components/Loading"; -import { StatusChip } from "../../components/StatusChip"; -import TitleCard from "../../components/TitleCard"; -import RayletWorkerTable from "../../components/WorkerTable"; -import { longTextCut } from "../../util/func"; -import { useJobDetail } from "./hook/useJobDetail"; - -const useStyle = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - }, - paper: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - marginBottom: theme.spacing(2), - }, - label: { - fontWeight: "bold", - }, - pageMeta: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - }, - tab: { - marginBottom: theme.spacing(2), - }, - dependenciesChip: { - margin: theme.spacing(0.5), - wordBreak: "break-all", - }, - alert: { - color: theme.palette.error.main, - }, -})); - -const JobDetailPage = (props: RouteComponentProps<{ id: string }>) => { - const classes = useStyle(); - const { - actorMap, - jobInfo, - job, - msg, - selectedTab, - handleChange, - handleSwitchChange, - params, - refreshing, - ipLogMap, - } = useJobDetail(props); - - if (!job || !jobInfo) { - return ( -
- - - -
- Auto Refresh: - -
- Request Status: {msg}
-
-
- ); - } - - return ( -
- - -
- Auto Refresh: - -
- Request Status: {msg}
-
- - - - - - - - {selectedTab === "info" && ( - - - Driver IP:{" "} - {jobInfo.driverIpAddress} - - {ipLogMap[jobInfo.driverIpAddress] && ( - - Driver Log:{" "} - - Log - - - )} - - Driver Pid:{" "} - {jobInfo.driverPid} - - {jobInfo.eventUrl && ( - - Event Link:{" "} - - Event Log - - - )} - {jobInfo.failErrorMessage && ( - - Fail Error:{" "} - - {jobInfo.failErrorMessage} - - - )} - - )} - {jobInfo?.dependencies && selectedTab === "dep" && ( -
- {jobInfo?.dependencies?.python && ( - -
- {jobInfo.dependencies.python.map((e) => ( - - ))} -
-
- )} - {jobInfo?.dependencies?.java && ( - - - - - - {["Name", "Version", "URL"].map((col) => ( - - {col} - - ))} - - - - {jobInfo.dependencies.java.map( - ({ name, version, url }) => ( - - {name} - {version} - - - {url} - - - - ), - )} - -
-
-
- )} -
- )} - {selectedTab === "worker" && ( -
- - - -
- )} - {selectedTab === "actor" && ( -
- - - -
- )} -
-
- ); -}; - -export default JobDetailPage; diff --git a/dashboard/client/src/pages/job/hook/useJobDetail.ts b/dashboard/client/src/pages/job/hook/useJobDetail.ts deleted file mode 100644 index 695fca760931..000000000000 --- a/dashboard/client/src/pages/job/hook/useJobDetail.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { useCallback, useContext, useEffect, useRef, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import { GlobalContext } from "../../../App"; -import { getJobDetail } from "../../../service/job"; -import { JobDetail } from "../../../type/job"; - -export const useJobDetail = (props: RouteComponentProps<{ id: string }>) => { - const { - match: { params }, - } = props; - const [job, setJob] = useState(); - const [msg, setMsg] = useState("Loading the job detail"); - const [refreshing, setRefresh] = useState(true); - const [selectedTab, setTab] = useState("info"); - const { ipLogMap } = useContext(GlobalContext); - const tot = useRef(); - const handleChange = (event: React.ChangeEvent<{}>, newValue: string) => { - setTab(newValue); - }; - const handleSwitchChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - const getJob = useCallback(async () => { - if (!refreshing) { - return; - } - const rsp = await getJobDetail(params.id); - - if (rsp.data?.data?.detail) { - setJob(rsp.data.data.detail); - } - - if (rsp.data?.msg) { - setMsg(rsp.data.msg || ""); - } - - if (rsp.data.result === false) { - setMsg("Job Query Error Please Check JobId"); - setJob(undefined); - setRefresh(false); - } - - tot.current = setTimeout(getJob, 4000); - }, [refreshing, params.id]); - - useEffect(() => { - if (tot.current) { - clearTimeout(tot.current); - } - getJob(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getJob]); - - const { jobInfo } = job || {}; - const actorMap = job?.jobActors; - - return { - actorMap, - jobInfo, - job, - msg, - selectedTab, - handleChange, - 
handleSwitchChange, - params, - refreshing, - ipLogMap, - }; -}; diff --git a/dashboard/client/src/pages/job/hook/useJobList.ts b/dashboard/client/src/pages/job/hook/useJobList.ts deleted file mode 100644 index 04f97532f75c..000000000000 --- a/dashboard/client/src/pages/job/hook/useJobList.ts +++ /dev/null @@ -1,68 +0,0 @@ -import { useCallback, useEffect, useRef, useState } from "react"; -import { getJobList } from "../../../service/job"; -import { Job } from "../../../type/job"; - -export const useJobList = () => { - const [jobList, setList] = useState([]); - const [page, setPage] = useState({ pageSize: 10, pageNo: 1 }); - const [msg, setMsg] = useState("Loading the job list..."); - const [isRefreshing, setRefresh] = useState(true); - const [filter, setFilter] = useState< - { - key: "jobId" | "name" | "language" | "state" | "namespaceId"; - val: string; - }[] - >([]); - const refreshRef = useRef(isRefreshing); - const tot = useRef(); - const changeFilter = ( - key: "jobId" | "name" | "language" | "state" | "namespaceId", - val: string, - ) => { - const f = filter.find((e) => e.key === key); - if (f) { - f.val = val; - } else { - filter.push({ key, val }); - } - setFilter([...filter]); - }; - const onSwitchChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - refreshRef.current = isRefreshing; - const getJob = useCallback(async () => { - if (!refreshRef.current) { - return; - } - const rsp = await getJobList(); - - if (rsp?.data?.data?.summary) { - setList(rsp.data.data.summary.sort((a, b) => b.timestamp - a.timestamp)); - setMsg(rsp.data.msg || ""); - } - - tot.current = setTimeout(getJob, 4000); - }, []); - - useEffect(() => { - getJob(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getJob]); - return { - jobList: jobList.filter((node) => - filter.every((f) => node[f.key] && node[f.key].includes(f.val)), - ), - msg, - isRefreshing, - onSwitchChange, - changeFilter, - page, - originalJobs: 
jobList, - setPage: (key: string, val: number) => setPage({ ...page, [key]: val }), - }; -}; diff --git a/dashboard/client/src/pages/job/index.tsx b/dashboard/client/src/pages/job/index.tsx deleted file mode 100644 index 8d2a4aaa4c96..000000000000 --- a/dashboard/client/src/pages/job/index.tsx +++ /dev/null @@ -1,129 +0,0 @@ -import { - Switch, - Table, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, -} from "@material-ui/core"; -import { makeStyles } from "@material-ui/core/styles"; -import Pagination from "@material-ui/lab/Pagination"; -import dayjs from "dayjs"; -import React from "react"; -import { Link } from "react-router-dom"; -import Loading from "../../components/Loading"; -import { SearchInput, SearchSelect } from "../../components/SearchComponent"; -import TitleCard from "../../components/TitleCard"; -import { useJobList } from "./hook/useJobList"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - }, -})); - -const columns = ["ID", "DriverIpAddress", "DriverPid", "IsDead", "Timestamp"]; - -const JobList = () => { - const classes = useStyles(); - const { - msg, - isRefreshing, - onSwitchChange, - jobList, - changeFilter, - page, - setPage, - } = useJobList(); - - return ( -
- - - Auto Refresh: - -
- Request Status: {msg} -
- - - changeFilter("jobId", value)} - /> - changeFilter("language", value)} - options={["JAVA", "PYTHON"]} - /> - - setPage("pageSize", Math.min(Number(value), 500) || 10) - } - /> -
- setPage("pageNo", pageNo)} - /> -
- - - - {columns.map((col) => ( - - {col} - - ))} - - - - {jobList - .slice( - (page.pageNo - 1) * page.pageSize, - page.pageNo * page.pageSize, - ) - .map( - ({ - jobId = "", - driverIpAddress, - isDead, - driverPid, - state, - timestamp, - namespaceId, - }) => ( - - - {jobId} - - {driverIpAddress} - {driverPid} - - {isDead ? "true" : "false"} - - - {dayjs(timestamp * 1000).format("YYYY/MM/DD HH:mm:ss")} - - {namespaceId} - - ), - )} - -
-
-
-
- ); -}; - -export default JobList; diff --git a/dashboard/client/src/pages/layout/index.tsx b/dashboard/client/src/pages/layout/index.tsx deleted file mode 100644 index b484a29db646..000000000000 --- a/dashboard/client/src/pages/layout/index.tsx +++ /dev/null @@ -1,167 +0,0 @@ -import { IconButton, Tooltip } from "@material-ui/core"; -import Drawer from "@material-ui/core/Drawer"; -import List from "@material-ui/core/List"; -import ListItem from "@material-ui/core/ListItem"; -import ListItemText from "@material-ui/core/ListItemText"; -import { makeStyles } from "@material-ui/core/styles"; -import Typography from "@material-ui/core/Typography"; -import { NightsStay, VerticalAlignTop, WbSunny } from "@material-ui/icons"; -import classnames from "classnames"; -import React, { PropsWithChildren } from "react"; -import { RouteComponentProps } from "react-router-dom"; - -import SpeedTools from "../../components/SpeedTools"; -import Logo from "../../logo.svg"; - -const drawerWidth = 200; - -const useStyles = makeStyles((theme) => ({ - root: { - display: "flex", - "& a": { - color: theme.palette.primary.main, - }, - }, - drawer: { - width: drawerWidth, - flexShrink: 0, - background: theme.palette.background.paper, - }, - drawerPaper: { - width: drawerWidth, - border: "none", - background: theme.palette.background.paper, - boxShadow: theme.shadows[1], - }, - title: { - padding: theme.spacing(2), - textAlign: "center", - lineHeight: "36px", - }, - divider: { - background: "rgba(255, 255, 255, .12)", - }, - menuItem: { - cursor: "pointer", - "&:hover": { - background: theme.palette.primary.main, - }, - }, - selected: { - background: `linear-gradient(45deg, ${theme.palette.primary.main} 30%, ${theme.palette.secondary.main} 90%)`, - }, - child: { - flex: 1, - }, -})); - -const BasicLayout = ( - props: PropsWithChildren< - { setTheme: (theme: string) => void; theme: string } & RouteComponentProps - >, -) => { - const classes = useStyles(); - const { location, history, children, 
setTheme, theme } = props; - - return ( -
- - - Ray
Ray Dashboard -
- - history.push("/summary")} - > - SUMMARY - - history.push("/node")} - > - NODES - - history.push("/job")} - > - JOBS - - history.push("/actors")} - > - ACTORS - - history.push("/log")} - > - LOGS - - history.push("/")} - > - BACK TO LEGACY - - - { - window.scrollTo(0, 0); - }} - > - - - - - { - setTheme(theme === "dark" ? "light" : "dark"); - }} - > - - {theme === "dark" ? : } - - - - - -
-
{children}
-
- ); -}; - -export default BasicLayout; diff --git a/dashboard/client/src/pages/log/Logs.tsx b/dashboard/client/src/pages/log/Logs.tsx deleted file mode 100644 index 12218d52a0fa..000000000000 --- a/dashboard/client/src/pages/log/Logs.tsx +++ /dev/null @@ -1,306 +0,0 @@ -import { - Button, - InputAdornment, - LinearProgress, - List, - ListItem, - makeStyles, - Paper, - Switch, - TextField, -} from "@material-ui/core"; -import { SearchOutlined } from "@material-ui/icons"; -import React, { useEffect, useRef, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import LogVirtualView from "../../components/LogView/LogVirtualView"; -import { SearchInput } from "../../components/SearchComponent"; -import TitleCard from "../../components/TitleCard"; -import { getLogDetail } from "../../service/log"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - }, - table: { - marginTop: theme.spacing(4), - padding: theme.spacing(2), - }, - pageMeta: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - }, - search: { - margin: theme.spacing(1), - }, -})); - -type LogsProps = RouteComponentProps<{ host?: string; path?: string }> & { - theme?: "dark" | "light"; -}; - -const useLogs = (props: LogsProps) => { - const { - match: { params }, - location: { search: urlSearch }, - theme, - } = props; - const { host, path } = params; - const searchMap = new URLSearchParams(urlSearch); - const urlFileName = searchMap.get("fileName"); - const el = useRef(null); - const [origin, setOrigin] = useState(); - const [search, setSearch] = useState<{ - keywords?: string; - lineNumber?: string; - fontSize?: number; - revert?: boolean; - }>(); - const [fileName, setFileName] = useState(searchMap.get("fileName") || ""); - const [log, setLogs] = useState< - undefined | string | { [key: string]: string }[] - >(); - const [startTime, setStart] = useState(); - const [endTime, setEnd] = useState(); - - useEffect(() => 
{ - setFileName(urlFileName || ""); - }, [urlFileName]); - - useEffect(() => { - let url = "log_index"; - setLogs("Loading..."); - if (host) { - url = decodeURIComponent(host); - setOrigin(new URL(url).origin); - if (path) { - url += decodeURIComponent(path); - } - } else { - setOrigin(undefined); - } - getLogDetail(url) - .then((res) => { - if (res) { - setLogs(res); - } else { - setLogs("(null)"); - } - }) - .catch(() => { - setLogs("Failed to load"); - }); - }, [host, path]); - - return { - log, - origin, - host, - path, - el, - search, - setSearch, - theme, - fileName, - setFileName, - startTime, - setStart, - endTime, - setEnd, - }; -}; - -const Logs = (props: LogsProps) => { - const classes = useStyles(); - const { - log, - origin, - path, - el, - search, - setSearch, - theme, - fileName, - setFileName, - startTime, - setStart, - endTime, - setEnd, - } = useLogs(props); - let href = "#/log/"; - - if (origin) { - if (path) { - const after = decodeURIComponent(path).split("/"); - after.pop(); - if (after.length > 1) { - href += encodeURIComponent(origin); - href += "/"; - href += encodeURIComponent(after.join("/")); - } - } - } - - return ( -
- - - {!origin &&

Please choose an url to get log path

} - {origin && ( -

- Now Path: {origin} - {decodeURIComponent(path || "")} -

- )} - {origin && ( -
- - {typeof log === "object" && ( - { - setFileName(val); - }} - /> - )} -
- )} -
- - {typeof log === "object" && ( - - {log - .filter((e) => !fileName || e?.name?.includes(fileName)) - .map((e: { [key: string]: string }) => ( - - - {e.name} - - - ))} - - )} - {typeof log === "string" && log !== "Loading..." && ( -
-
- { - setSearch({ ...search, keywords: value }); - }, - type: "", - endAdornment: ( - - - - ), - }} - /> - { - setSearch({ ...search, lineNumber: value }); - }, - type: "", - endAdornment: ( - - - - ), - }} - /> - { - setSearch({ ...search, fontSize: Number(value) }); - }, - type: "", - }} - /> - { - setStart(val.target.value); - }} - InputLabelProps={{ - shrink: true, - }} - /> - { - setEnd(val.target.value); - }} - InputLabelProps={{ - shrink: true, - }} - /> -
- Reverse:{" "} - setSearch({ ...search, revert: v })} - /> - -
-
- -
- )} - {log === "Loading..." && ( -
-
- -
- )} -
-
-
- ); -}; - -export default Logs; diff --git a/dashboard/client/src/pages/node/NodeDetail.tsx b/dashboard/client/src/pages/node/NodeDetail.tsx deleted file mode 100644 index 6f5187bdb822..000000000000 --- a/dashboard/client/src/pages/node/NodeDetail.tsx +++ /dev/null @@ -1,287 +0,0 @@ -import { - Grid, - makeStyles, - Switch, - Tab, - TableContainer, - Tabs, -} from "@material-ui/core"; -import dayjs from "dayjs"; -import React from "react"; -import { Link, RouteComponentProps } from "react-router-dom"; -import ActorTable from "../../components/ActorTable"; -import Loading from "../../components/Loading"; -import PercentageBar from "../../components/PercentageBar"; -import { StatusChip } from "../../components/StatusChip"; -import TitleCard from "../../components/TitleCard"; -import RayletWorkerTable from "../../components/WorkerTable"; -import { ViewMeasures } from "../../type/raylet"; -import { memoryConverter } from "../../util/converter"; -import { useNodeDetail } from "./hook/useNodeDetail"; - -const useStyle = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - }, - paper: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - marginBottom: theme.spacing(2), - }, - label: { - fontWeight: "bold", - }, - tab: { - marginBottom: theme.spacing(2), - }, -})); - -const showMeasureKeys = [ - "local_total_resource", - "local_available_resource", - "actor_stats", - "task_dependency_manager_stats", - "reconstruction_policy_stats", - "scheduling_queue_stats", - "object_manager_stats", -]; - -const ViewDataDisplayer = ({ view }: { view?: ViewMeasures }) => { - if (!view) { - return null; - } - const { tags = "", ...otherProps } = view; - - return ( - - {tags.split(",").pop()?.split(":").slice(1).join(":")}= - {Object.keys(otherProps).length > 0 ? 
( - JSON.stringify(Object.values(otherProps).pop()) - ) : ( - null - )} - - ); -}; - -const NodeDetailPage = (props: RouteComponentProps<{ id: string }>) => { - const classes = useStyle(); - const { - params, - selectedTab, - nodeDetail, - msg, - isRefreshing, - onRefreshChange, - raylet, - handleChange, - } = useNodeDetail(props); - - return ( -
- - - -
- Auto Refresh: - -
- Request Status: {msg} -
- - - - - - - - {nodeDetail && selectedTab === "info" && ( -
- - -
Hostname
{" "} - {nodeDetail.hostname} -
- -
IP
{nodeDetail.ip} -
-
- - -
CPU (Logic/Physic)
{" "} - {nodeDetail.cpus[0]}/ {nodeDetail.cpus[1]} -
- -
Load (1/5/15min)
{" "} - {nodeDetail?.loadAvg[0] && - nodeDetail.loadAvg[0] - .map((e) => Number(e).toFixed(2)) - .join("/")} -
-
- - -
Load per CPU (1/5/15min)
{" "} - {nodeDetail?.loadAvg[1] && - nodeDetail.loadAvg[1] - .map((e) => Number(e).toFixed(2)) - .join("/")} -
- -
Boot Time
{" "} - {dayjs(nodeDetail.bootTime * 1000).format( - "YYYY/MM/DD HH:mm:ss", - )} -
-
- - -
Sent Tps
{" "} - {memoryConverter(nodeDetail?.net[0])}/s -
- -
Recieved Tps
{" "} - {memoryConverter(nodeDetail?.net[1])}/s -
-
- - -
Memory
{" "} - {nodeDetail?.mem && ( - - {memoryConverter(nodeDetail?.mem[0] - nodeDetail?.mem[1])}/ - {memoryConverter(nodeDetail?.mem[0])}({nodeDetail?.mem[2]}%) - - )} -
- -
CPU
{" "} - - {nodeDetail.cpu}% - -
-
- - {nodeDetail?.disk && - Object.entries(nodeDetail?.disk).map(([path, obj]) => ( - -
Disk ({path})
{" "} - {obj && ( - - {memoryConverter(obj.used)}/{memoryConverter(obj.total)} - ({obj.percent}%, {memoryConverter(obj.free)} free) - - )} -
- ))} -
- - -
Logs
{" "} - - log - -
-
-
- )} - {raylet && Object.keys(raylet).length > 0 && selectedTab === "raylet" && ( - -
- - -
Command
-
-
- {nodeDetail?.cmdline.join(" ")} -
-
-
- - -
Pid
{raylet?.pid} -
- -
Workers Num
{" "} - {raylet?.numWorkers} -
- -
Node Manager Port
{" "} - {raylet?.nodeManagerPort} -
-
- {showMeasureKeys - .map((e) => raylet.viewData.find((view) => view.viewName === e)) - .map((e) => - e ? ( - -

- {e.viewName - .split("_") - .map((e) => e[0].toUpperCase() + e.slice(1)) - .join(" ")} -

- - {e.measures.map((e) => ( - - ))} - -
- ) : null, - )} -
-
- )} - {nodeDetail?.workers && selectedTab === "worker" && ( - - - - - - )} - {nodeDetail?.actors && selectedTab === "actor" && ( - - - - - - )} -
-
- ); -}; - -export default NodeDetailPage; diff --git a/dashboard/client/src/pages/node/hook/useNodeDetail.ts b/dashboard/client/src/pages/node/hook/useNodeDetail.ts deleted file mode 100644 index 1ca3570a20ff..000000000000 --- a/dashboard/client/src/pages/node/hook/useNodeDetail.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { useCallback, useContext, useEffect, useRef, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import { GlobalContext } from "../../../App"; -import { getNodeDetail } from "../../../service/node"; -import { NodeDetailExtend } from "../../../type/node"; - -export const useNodeDetail = (props: RouteComponentProps<{ id: string }>) => { - const { - match: { params }, - } = props; - const [selectedTab, setTab] = useState("info"); - const [nodeDetail, setNode] = useState(); - const [msg, setMsg] = useState("Loading the node infos..."); - const { namespaceMap } = useContext(GlobalContext); - const [isRefreshing, setRefresh] = useState(true); - const tot = useRef(); - const onRefreshChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - const getDetail = useCallback(async () => { - if (!isRefreshing) { - return; - } - const { data } = await getNodeDetail(params.id); - const { data: rspData, msg, result } = data; - if (rspData?.detail) { - setNode(rspData.detail); - } - - if (msg) { - setMsg(msg); - } - - if (result === false) { - setMsg("Node Query Error Please Check Node Name"); - setRefresh(false); - } - - tot.current = setTimeout(getDetail, 4000); - }, [isRefreshing, params.id]); - const raylet = nodeDetail?.raylet; - const handleChange = (event: React.ChangeEvent<{}>, newValue: string) => { - setTab(newValue); - }; - - useEffect(() => { - getDetail(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getDetail]); - - return { - params, - selectedTab, - nodeDetail, - msg, - isRefreshing, - onRefreshChange, - raylet, - handleChange, - namespaceMap, - }; -}; 
diff --git a/dashboard/client/src/pages/node/hook/useNodeList.ts b/dashboard/client/src/pages/node/hook/useNodeList.ts deleted file mode 100644 index 96a3339ba4e8..000000000000 --- a/dashboard/client/src/pages/node/hook/useNodeList.ts +++ /dev/null @@ -1,74 +0,0 @@ -import { useCallback, useEffect, useRef, useState } from "react"; -import { getNodeList } from "../../../service/node"; -import { NodeDetail } from "../../../type/node"; -import { useSorter } from "../../../util/hook"; - -export const useNodeList = () => { - const [nodeList, setList] = useState([]); - const [msg, setMsg] = useState("Loading the nodes infos..."); - const [isRefreshing, setRefresh] = useState(true); - const [mode, setMode] = useState("table"); - const [filter, setFilter] = useState< - { key: "hostname" | "ip" | "state"; val: string }[] - >([]); - const [page, setPage] = useState({ pageSize: 10, pageNo: 1 }); - const { sorterFunc, setOrderDesc, setSortKey, sorterKey } = useSorter("cpu"); - const tot = useRef(); - const changeFilter = (key: "hostname" | "ip" | "state", val: string) => { - const f = filter.find((e) => e.key === key); - if (f) { - f.val = val; - } else { - filter.push({ key, val }); - } - setFilter([...filter]); - }; - const onSwitchChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - const getList = useCallback(async () => { - if (!isRefreshing) { - return; - } - const { data } = await getNodeList(); - const { data: rspData, msg } = data; - setList(rspData.summary || []); - if (msg) { - setMsg(msg); - } else { - setMsg(""); - } - tot.current = setTimeout(getList, 4000); - }, [isRefreshing]); - - useEffect(() => { - getList(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getList]); - - return { - nodeList: nodeList - .map((e) => ({ ...e, state: e.raylet.state })) - .sort((a, b) => (a.raylet.nodeId > b.raylet.nodeId ? 
1 : -1)) - .sort(sorterFunc) - .filter((node) => - filter.every((f) => node[f.key] && node[f.key].includes(f.val)), - ), - msg, - isRefreshing, - onSwitchChange, - changeFilter, - page, - originalNodes: nodeList, - setPage: (key: string, val: number) => setPage({ ...page, [key]: val }), - sorterKey, - setSortKey, - setOrderDesc, - mode, - setMode, - }; -}; diff --git a/dashboard/client/src/pages/node/index.tsx b/dashboard/client/src/pages/node/index.tsx deleted file mode 100644 index 3713fdc15748..000000000000 --- a/dashboard/client/src/pages/node/index.tsx +++ /dev/null @@ -1,392 +0,0 @@ -import { - Button, - ButtonGroup, - Grid, - Paper, - Switch, - Table, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, - Tooltip, -} from "@material-ui/core"; -import { makeStyles } from "@material-ui/core/styles"; -import Pagination from "@material-ui/lab/Pagination"; -import dayjs from "dayjs"; -import React from "react"; -import { Link } from "react-router-dom"; -import Loading from "../../components/Loading"; -import PercentageBar from "../../components/PercentageBar"; -import { SearchInput, SearchSelect } from "../../components/SearchComponent"; -import StateCounter from "../../components/StatesCounter"; -import { StatusChip } from "../../components/StatusChip"; -import TitleCard from "../../components/TitleCard"; -import { NodeDetail } from "../../type/node"; -import { memoryConverter } from "../../util/converter"; -import { useNodeList } from "./hook/useNodeList"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - position: "relative", - }, -})); - -const columns = [ - "State", - "ID", - "Host", - "IP", - "CPU Usage", - "Memory", - "Disk(root)", - "Sent", - "Received", - "BRPC Port", - "Time Info", - "Log", -]; - -export const brpcLinkChanger = (href: string) => { - const { location } = window; - const { pathname } = location; - const pathArr = pathname.split("/"); - if (pathArr.some((e) => 
e.split(".").length > 1)) { - const index = pathArr.findIndex((e) => e.includes(".")); - const resultArr = pathArr.slice(0, index); - resultArr.push(href); - return `${location.protocol}//${location.host}${resultArr.join("/")}`; - } - - return `http://${href}`; -}; - -export const NodeCard = (props: { node: NodeDetail }) => { - const { node } = props; - - if (!node) { - return null; - } - - const { raylet, hostname, ip, cpu, mem, net, disk, logUrl } = node; - const { nodeId, state, brpcPort } = raylet; - - return ( - -

- {nodeId}{" "} -

-

- - - - - - {hostname}({ip}) - - {net && net[0] >= 0 && ( - - Sent{" "} - {memoryConverter(net[0])}/s{" "} - Received{" "} - {memoryConverter(net[1])}/s - - )} - -

- - {cpu >= 0 && ( - - CPU - - {cpu}% - - - )} - {mem && ( - - Memory - - {memoryConverter(mem[0] - mem[1])}/{memoryConverter(mem[0])}( - {mem[2]}%) - - - )} - {disk && disk["/"] && ( - - Disk('/') - - {memoryConverter(disk["/"].used)}/ - {memoryConverter(disk["/"].total)}({disk["/"].percent}%) - - - )} - - - - - - - - - -
- ); -}; - -const Nodes = () => { - const classes = useStyles(); - const { - msg, - isRefreshing, - onSwitchChange, - nodeList, - changeFilter, - page, - setPage, - setSortKey, - setOrderDesc, - mode, - setMode, - } = useNodeList(); - - return ( -
- - - Auto Refresh: - -
- Request Status: {msg} -
- - - - - - - changeFilter("hostname", value.trim())} - /> - - - changeFilter("ip", value.trim())} - /> - - - changeFilter("state", value.trim())} - options={["ALIVE", "DEAD"]} - /> - - - - setPage("pageSize", Math.min(Number(value), 500) || 10) - } - /> - - - setSortKey(val)} - /> - - - - Reverse: - setOrderDesc(checked)} /> - - - - - - - - - -
- setPage("pageNo", pageNo)} - /> -
- {mode === "table" && ( - - - - - {columns.map((col) => ( - - {col} - - ))} - - - - {nodeList - .slice( - (page.pageNo - 1) * page.pageSize, - page.pageNo * page.pageSize, - ) - .map( - ( - { - hostname = "", - ip = "", - cpu = 0, - mem = [], - disk, - net = [0, 0], - raylet, - logUrl, - }: NodeDetail, - i, - ) => ( - - - - - - - - {raylet.nodeId.slice(0, 5)} - - - - {hostname} - {ip} - - - {cpu}% - - - - - {memoryConverter(mem[0] - mem[1])}/ - {memoryConverter(mem[0])}({mem[2]}%) - - - - {disk && disk["/"] && ( - - {memoryConverter(disk["/"].used)}/ - {memoryConverter(disk["/"].total)}( - {disk["/"].percent}%) - - )} - - - {memoryConverter(net[0])}/s - - - {memoryConverter(net[1])}/s - - - {raylet.brpcPort && ( - - {raylet.brpcPort} - - )} - - - {!!raylet.startTime && ( -

- Start Time:{" "} - {dayjs(raylet.startTime * 1000).format( - "YYYY/MM/DD HH:mm:ss", - )} -

- )} - {!!raylet.terminateTime && ( -

- End Time:{" "} - {dayjs(raylet.terminateTime * 1000).format( - "YYYY/MM/DD HH:mm:ss", - )} -

- )} -
- - - Log - - -
- ), - )} -
-
-
- )} - {mode === "card" && ( - - {nodeList - .slice( - (page.pageNo - 1) * page.pageSize, - page.pageNo * page.pageSize, - ) - .map((e) => ( - - - - ))} - - )} -
-
- ); -}; - -export default Nodes; diff --git a/dashboard/client/src/service/actor.ts b/dashboard/client/src/service/actor.ts deleted file mode 100644 index 425fd62a44de..000000000000 --- a/dashboard/client/src/service/actor.ts +++ /dev/null @@ -1,14 +0,0 @@ -import axios from "axios"; -import { Actor } from "../type/actor"; - -export const getActors = () => { - return axios.get<{ - result: boolean; - message: string; - data: { - actors: { - [actorId: string]: Actor; - }; - }; - }>("logical/actors"); -}; diff --git a/dashboard/client/src/service/cluster.ts b/dashboard/client/src/service/cluster.ts deleted file mode 100644 index 9bf53e76dbb9..000000000000 --- a/dashboard/client/src/service/cluster.ts +++ /dev/null @@ -1,6 +0,0 @@ -import axios from "axios"; -import { RayConfigRsp } from "../type/config"; - -export const getRayConfig = () => { - return axios.get("api/ray_config"); -}; diff --git a/dashboard/client/src/service/job.ts b/dashboard/client/src/service/job.ts deleted file mode 100644 index fc5d5452db68..000000000000 --- a/dashboard/client/src/service/job.ts +++ /dev/null @@ -1,10 +0,0 @@ -import axios from "axios"; -import { JobDetailRsp, JobListRsp } from "../type/job"; - -export const getJobList = () => { - return axios.get("jobs?view=summary"); -}; - -export const getJobDetail = (id: string) => { - return axios.get(`jobs/${id}`); -}; diff --git a/dashboard/client/src/service/log.ts b/dashboard/client/src/service/log.ts deleted file mode 100644 index b485b12f1684..000000000000 --- a/dashboard/client/src/service/log.ts +++ /dev/null @@ -1,35 +0,0 @@ -import axios from "axios"; - -export const getLogDetail = async (url: string) => { - if (window.location.pathname !== "/" && url !== "log_index") { - const pathArr = window.location.pathname.split("/"); - if (pathArr.length > 1) { - const idx = pathArr.findIndex((e) => e.includes(":")); - if (idx > -1) { - const afterArr = pathArr.slice(0, idx); - afterArr.push(url.replace(/https?:\/\//, "")); - url = 
afterArr.join("/"); - } - } - } - const rsp = await axios.get( - url === "log_index" ? url : `log_proxy?url=${encodeURIComponent(url)}`, - ); - if (rsp.headers["content-type"]?.includes("html")) { - const el = document.createElement("div"); - el.innerHTML = rsp.data; - const arr = [].map.call( - el.getElementsByTagName("li"), - (li: HTMLLIElement) => { - const a = li.children[0] as HTMLAnchorElement; - return { - name: li.innerText, - href: li.innerText.includes("http") ? a.href : a.pathname, - } as { [key: string]: string }; - }, - ); - return arr as { [key: string]: string }[]; - } - - return rsp.data as string; -}; diff --git a/dashboard/client/src/service/node.ts b/dashboard/client/src/service/node.ts deleted file mode 100644 index 5eac1dc9cafb..000000000000 --- a/dashboard/client/src/service/node.ts +++ /dev/null @@ -1,10 +0,0 @@ -import axios from "axios"; -import { NodeDetailRsp, NodeListRsp } from "../type/node"; - -export const getNodeList = async () => { - return await axios.get("nodes?view=summary"); -}; - -export const getNodeDetail = async (id: string) => { - return await axios.get(`nodes/${id}`); -}; diff --git a/dashboard/client/src/service/util.ts b/dashboard/client/src/service/util.ts deleted file mode 100644 index 966c82db2919..000000000000 --- a/dashboard/client/src/service/util.ts +++ /dev/null @@ -1,52 +0,0 @@ -import axios from "axios"; - -type CMDRsp = { - result: boolean; - msg: string; - data: { - output: string; - }; -}; - -export const getJstack = (ip: string, pid: string) => { - return axios.get("utils/jstack", { - params: { - ip, - pid, - }, - }); -}; - -export const getJmap = (ip: string, pid: string) => { - return axios.get("utils/jmap", { - params: { - ip, - pid, - }, - }); -}; - -export const getJstat = (ip: string, pid: string, options: string) => { - return axios.get("utils/jstat", { - params: { - ip, - pid, - options, - }, - }); -}; - -type NamespacesRsp = { - result: boolean; - msg: string; - data: { - namespaces: { - 
namespaceId: string; - hostNameList: string[]; - }[]; - }; -}; - -export const getNamespaces = () => { - return axios.get("namespaces"); -}; diff --git a/dashboard/client/src/theme.ts b/dashboard/client/src/theme.ts deleted file mode 100644 index f83d58b5ad46..000000000000 --- a/dashboard/client/src/theme.ts +++ /dev/null @@ -1,61 +0,0 @@ -import { blue, blueGrey, grey, lightBlue } from "@material-ui/core/colors"; -import { createMuiTheme } from "@material-ui/core/styles"; - -const basicTheme = { - typography: { - fontSize: 12, - fontFamily: [ - "-apple-system", - "BlinkMacSystemFont", - '"Segoe UI"', - "Roboto", - '"Helvetica Neue"', - "Arial", - "sans-serif", - '"Apple Color Emoji"', - '"Segoe UI Emoji"', - '"Segoe UI Symbol"', - ].join(","), - }, - props: { - MuiPaper: { - elevation: 0, - }, - }, -}; - -export const lightTheme = createMuiTheme({ - ...basicTheme, - palette: { - primary: blue, - secondary: lightBlue, - text: { - primary: grey[900], - secondary: grey[800], - disabled: grey[400], - hint: grey[300], - }, - background: { - paper: "#fff", - default: blueGrey[50], - }, - }, -}); - -export const darkTheme = createMuiTheme({ - ...basicTheme, - palette: { - primary: blue, - secondary: lightBlue, - text: { - primary: blueGrey[50], - secondary: blueGrey[100], - disabled: blueGrey[200], - hint: blueGrey[300], - }, - background: { - paper: grey[800], - default: grey[900], - }, - }, -}); diff --git a/dashboard/client/src/type/actor.ts b/dashboard/client/src/type/actor.ts deleted file mode 100644 index 8a00c0e41269..000000000000 --- a/dashboard/client/src/type/actor.ts +++ /dev/null @@ -1,94 +0,0 @@ -export enum ActorEnum { - ALIVE = "ALIVE", - PENDING = "PENDING", - RECONSTRUCTING = "RECONSTRUCTING", - DEAD = "DEAD", -} - -export type Address = { - rayletId: string; - ipAddress: string; - port: number; - workerId: string; -}; - -export type TaskSpec = { - actorCreationTaskSpec: { - actorId: string; - dynamicWorkerOptions: string[]; - extensionData: string; - 
isAsyncio: boolean; - isDetached: boolean; - maxActorRestarts: boolean; - maxConcurrency: number; - name: string; - }; - args: { - data: string; - metadata: string; - nestedInlinedIds: string[]; - objectIds: string[]; - }[]; - callerAddress: { - ipAddress: string; - port: number; - rayletId: string; - workerId: string; - }; - callerId: string; - functionDescriptor: { - javaFunctionDescriptor: { - className: string; - functionName: string; - signature: string; - }; - pythonFunctionDescriptor: { - className: string; - functionName: string; - signature: string; - }; - }; - jobId: string; - language: string; - maxRetries: number; - numReturns: string; - parentCounter: string; - parentTaskId: string; - requiredPlacementResources: { - [key: string]: number; - }; - requiredResources: { - [key: string]: number; - }; - sourceActorId: string; - taskId: string; - type: string; -}; - -export type Actor = { - actorId: string; - children: { [key: string]: Actor }; - taskSpec: TaskSpec; - ipAddress: string; - isDirectCall: boolean; - jobId: string; - numExecutedTasks: number; - numLocalObjects: number; - numObjectIdsInScope: number; - state: ActorEnum | string; // PENDING, ALIVE, RECONSTRUCTING, DEAD - taskQueueLength: number; - usedObjectStoreMemory: number; - usedResources: { [key: string]: string | number }; - timestamp: number; - actorTitle: string; - averageTaskExecutionSpeed: number; - nodeId: string; - pid: number; - ownerAddress: Address; - address: Address; - maxReconstructions: string; - remainingReconstructions: string; - isDetached: false; - name: string; - numRestarts: string; -}; diff --git a/dashboard/client/src/type/config.d.ts b/dashboard/client/src/type/config.d.ts deleted file mode 100644 index 40a34a25fcd5..000000000000 --- a/dashboard/client/src/type/config.d.ts +++ /dev/null @@ -1,22 +0,0 @@ -export type RayConfig = { - userName: string; - workNodeNumber: number; - headNodeNumber: number; - containerVcores: number; - containerMemory: number; - clusterName: 
string; - supremeFo: boolean; - jobManagerPort: number; - externalRedisAddresses: string; - envParams: string; - sourceCodeLink: string; - imageUrl: string; -}; - -export type RayConfigRsp = { - result: boolean; - msg: string; - data: { - config: RayConfig; - }; -}; diff --git a/dashboard/client/src/type/event.d.ts b/dashboard/client/src/type/event.d.ts deleted file mode 100644 index 4f586f9a04d5..000000000000 --- a/dashboard/client/src/type/event.d.ts +++ /dev/null @@ -1,31 +0,0 @@ -export type Event = { - eventId: string; - jobId: string; - nodeId: string; - sourceType: string; - sourceHostname: string; - sourcePid: number; - label: string; - message: string; - timestamp: number; - severity: string; -}; - -export type EventRsp = { - result: boolean; - msg: string; - data: { - jobId: string; - events: Event[]; - }; -}; - -export type EventGlobalRsp = { - result: boolean; - msg: string; - data: { - events: { - global: Event[]; - }; - }; -}; diff --git a/dashboard/client/src/type/job.d.ts b/dashboard/client/src/type/job.d.ts deleted file mode 100644 index c5ca4dce874c..000000000000 --- a/dashboard/client/src/type/job.d.ts +++ /dev/null @@ -1,70 +0,0 @@ -import { Actor } from "./actor"; -import { Worker } from "./worker"; - -export type Job = { - jobId: string; - name: string; - owner: string; - language: string; - driverEntry: string; - state: string; - timestamp: number; - namespaceId: string; - driverPid: number; - driverIpAddress: string; - isDead: boolean; -}; - -export type PythonDependenciey = string; - -export type JavaDependency = { - name: string; - version: string; - md5: string; - url: string; -}; - -export type JobInfo = { - url: string; - driverArgs: string; - customConfig: { - [k: string]: string; - }; - jvmOptions: string; - dependencies: { - python: PythonDependenciey[]; - java: JavaDependency[]; - }; - driverStarted: boolean; - submitTime: string; - startTime: null | string | number; - endTime: null | string | number; - driverIpAddress: string; - 
driverHostname: string; - driverPid: number; - eventUrl: string; - failErrorMessage: string; - driverCmdline: string; -} & Job; - -export type JobDetail = { - jobInfo: JobInfo; - jobActors: { [id: string]: Actor }; - jobWorkers: Worker[]; -}; - -export type JobDetailRsp = { - data: { - detail: JobDetail; - }; - msg: string; - result: boolean; -}; - -export type JobListRsp = { - data: { - summary: Job[]; - }; - msg: string; - result: boolean; -}; diff --git a/dashboard/client/src/type/node.d.ts b/dashboard/client/src/type/node.d.ts deleted file mode 100644 index 12106d9adab0..000000000000 --- a/dashboard/client/src/type/node.d.ts +++ /dev/null @@ -1,62 +0,0 @@ -import { Actor } from "./actor"; -import { Raylet } from "./raylet"; -import { Worker } from "./worker"; - -export type NodeDetail = { - now: number; - hostname: string; - ip: string; - cpu: number; // cpu usage - cpus: number[]; // Logic CPU Count, Physical CPU Count - mem: number[]; // total memory, free memory, memory used ratio - bootTime: number; // start time - loadAvg: number[][]; // recent 1,5,15 minitues system load,load per cpu http://man7.org/linux/man-pages/man3/getloadavg.3.html - disk: { - // disk used on root - "/": { - total: number; - used: number; - free: number; - percent: number; - }; - // disk used on tmp - "/tmp": { - total: number; - used: number; - free: number; - percent: number; - }; - }; - net: number[]; // sent tps, received tps - raylet: Raylet; - logCounts: number; - errorCounts: number; - actors: { [id: string]: Actor }; - cmdline: string[]; - state: string; - logUrl: string; -}; - -export type NodeListRsp = { - data: { - summary: NodeDetail[]; - }; - result: boolean; - msg: string; -}; - -export type NodeDetailExtend = { - workers: Worker[]; - raylet: Raylet; - actors: { - [actorId: string]: Actor; - }; -} & NodeDetail; - -export type NodeDetailRsp = { - data: { - detail: NodeDetailExtend; - }; - msg: string; - result: boolean; -}; diff --git 
a/dashboard/client/src/type/raylet.d.ts b/dashboard/client/src/type/raylet.d.ts deleted file mode 100644 index 459b4c2b9086..000000000000 --- a/dashboard/client/src/type/raylet.d.ts +++ /dev/null @@ -1,28 +0,0 @@ -export type ViewMeasures = { - tags: string; - int_value?: number; - double_value?: number; - distribution_min?: number; - distribution_mean?: number; - distribution_max?: number; - distribution_count?: number; - distribution_bucket_boundaries?: number[]; - distribution_bucket_counts?: number[]; -}; - -export type ViewData = { - viewName: string; - measures: ViewMeasures[]; -}; - -export type Raylet = { - viewData: ViewData[]; - numWorkers: number; - pid: number; - nodeId: string; - nodeManagerPort: number; - brpcPort: pid; - state: string; - startTime: number; - terminateTime: number; -}; diff --git a/dashboard/client/src/type/worker.d.ts b/dashboard/client/src/type/worker.d.ts deleted file mode 100644 index cf35bfa018dd..000000000000 --- a/dashboard/client/src/type/worker.d.ts +++ /dev/null @@ -1,36 +0,0 @@ -export type CoreWorkerStats = { - currentTaskFuncDesc: string; - ipAddress: string; - port: string; - actorId: string; - usedResources: { [key: string]: number }; - numExecutedTasks: number; - workerId: string; - actorTitle: string; - jobId: string; -}; - -export type Worker = { - createTime: number; - cpuPercent: number; - cmdline: string[]; - memoryInfo: { - rss: number; // aka “Resident Set Size”, this is the non-swapped physical memory a process has used. On UNIX it matches “top“‘s RES column). On Windows this is an alias for wset field and it matches “Mem Usage” column of taskmgr.exe. - vms: number; // aka “Virtual Memory Size”, this is the total amount of virtual memory used by the process. On UNIX it matches “top“‘s VIRT column. On Windows this is an alias for pagefile field and it matches “Mem Usage” “VM Size” column of taskmgr.exe. - pfaults: number; // number of page faults. - pageins: number; // number of actual pageins. 
- [key: string]: number; - }; - cpuTimes: { - user: number; - system: number; - childrenUser: number; - childrenUystem: number; - iowait?: number; - }; - pid: number; - coreWorkerStats: CoreWorkerStats[]; - language: string; - hostname: string; - ip: hostname; -}; diff --git a/dashboard/client/src/util/converter.ts b/dashboard/client/src/util/converter.ts deleted file mode 100644 index 427ae86b78f3..000000000000 --- a/dashboard/client/src/util/converter.ts +++ /dev/null @@ -1,27 +0,0 @@ -export const memoryConverter = (bytes: number) => { - if (bytes < 1024) { - return `${bytes}KB`; - } - - if (bytes < 1024 ** 2) { - return `${(bytes / 1024 ** 1).toFixed(2)}KB`; - } - - if (bytes < 1024 ** 3) { - return `${(bytes / 1024 ** 2).toFixed(2)}MB`; - } - - if (bytes < 1024 ** 4) { - return `${(bytes / 1024 ** 3).toFixed(2)}GB`; - } - - if (bytes < 1024 ** 5) { - return `${(bytes / 1024 ** 4).toFixed(2)}TB`; - } - - if (bytes < 1024 ** 6) { - return `${(bytes / 1024 ** 5).toFixed(2)}TB`; - } - - return ""; -}; diff --git a/dashboard/client/src/util/func.tsx b/dashboard/client/src/util/func.tsx deleted file mode 100644 index c07ef70fe85b..000000000000 --- a/dashboard/client/src/util/func.tsx +++ /dev/null @@ -1,28 +0,0 @@ -import { Tooltip } from "@material-ui/core"; -import React, { CSSProperties } from "react"; - -export const longTextCut = (text: string = "", len: number = 28) => ( - - {text.length > len ? text.slice(0, len) + "..." : text} - -); - -export const jsonFormat = (str: string | object) => { - const preStyle = { - textAlign: "left", - wordBreak: "break-all", - whiteSpace: "pre-wrap", - } as CSSProperties; - if (typeof str === "object") { - return
{JSON.stringify(str, null, 2)}
; - } - try { - const j = JSON.parse(str); - if (typeof j !== "object") { - return JSON.stringify(j); - } - return
{JSON.stringify(j, null, 2)}
; - } catch (e) { - return str; - } -}; diff --git a/dashboard/client/src/util/hook.ts b/dashboard/client/src/util/hook.ts deleted file mode 100644 index 3c6f61b06ef8..000000000000 --- a/dashboard/client/src/util/hook.ts +++ /dev/null @@ -1,63 +0,0 @@ -import { get } from "lodash"; -import { useState } from "react"; - -export const useFilter = () => { - const [filters, setFilters] = useState<{ key: KeyType; val: string }[]>([]); - const changeFilter = (key: KeyType, val: string) => { - const f = filters.find((e) => e.key === key); - if (f) { - f.val = val; - } else { - filters.push({ key, val }); - } - setFilters([...filters]); - }; - const filterFunc = (instance: { [key: string]: any }) => { - return filters.every( - (f) => !f.val || get(instance, f.key, "").toString().includes(f.val), - ); - }; - - return { - changeFilter, - filterFunc, - }; -}; - -export const useSorter = (initialSortKey?: string) => { - const [sorter, setSorter] = useState({ - key: initialSortKey || "", - desc: false, - }); - - const sorterFunc = ( - instanceA: { [key: string]: any }, - instanceB: { [key: string]: any }, - ) => { - if (!sorter.key) { - return 0; - } - - let [b, a] = [instanceA, instanceB]; - if (sorter.desc) { - [a, b] = [instanceA, instanceB]; - } - - if (!get(a, sorter.key)) { - return -1; - } - - if (!get(b, sorter.key)) { - return 1; - } - - return get(a, sorter.key) > get(b, sorter.key) ? 
1 : -1; - }; - - return { - sorterFunc, - setSortKey: (key: string) => setSorter({ ...sorter, key }), - setOrderDesc: (desc: boolean) => setSorter({ ...sorter, desc }), - sorterKey: sorter.key, - }; -}; diff --git a/dashboard/client/src/util/localData.ts b/dashboard/client/src/util/localData.ts deleted file mode 100644 index 0066c4788b95..000000000000 --- a/dashboard/client/src/util/localData.ts +++ /dev/null @@ -1,12 +0,0 @@ -export const getLocalStorage = (key: string) => { - const data = window.localStorage.getItem(key); - try { - return JSON.parse(data || "") as T; - } catch { - return data; - } -}; - -export const setLocalStorage = (key: string, value: any) => { - return window.localStorage.setItem(key, JSON.stringify(value)); -}; From 44710d870ea326de5588e66972a491f222cadde8 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 151/244] Revert "[core] Pin arguments during task execution (#13737)" This reverts commit da4f463d567346fe0e4f2719b81962b5ca019985. 
--- python/ray/tests/test_object_manager.py | 16 ++- python/ray/tests/test_object_spilling.py | 3 + src/ray/raylet/dependency_manager.cc | 6 + src/ray/raylet/dependency_manager.h | 9 ++ src/ray/raylet/dependency_manager_test.cc | 10 ++ src/ray/raylet/node_manager.cc | 65 ++++------ src/ray/raylet/node_manager.h | 10 -- .../raylet/scheduling/cluster_task_manager.cc | 49 +------ .../raylet/scheduling/cluster_task_manager.h | 22 +--- .../scheduling/cluster_task_manager_test.cc | 122 ++++-------------- src/ray/raylet/test/util.h | 9 +- 11 files changed, 99 insertions(+), 222 deletions(-) diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index 004b1c2f6a5d..e38733f62d7e 100644 --- a/python/ray/tests/test_object_manager.py +++ b/python/ray/tests/test_object_manager.py @@ -296,6 +296,9 @@ def driver(): ray.get(driver.remote()) +@pytest.mark.skip( + reason="This hangs due to a deadlock between a worker getting its " + "arguments and the node pulling arguments for the next task queued.") @pytest.mark.timeout(30) def test_pull_bundles_admission_control(shutdown_only): cluster = Cluster() @@ -330,6 +333,9 @@ def foo(*args): ray.get(tasks) +@pytest.mark.skip( + reason="This hangs due to a deadlock between a worker getting its " + "arguments and the node pulling arguments for the next task queued.") @pytest.mark.timeout(30) def test_pull_bundles_admission_control_dynamic(shutdown_only): # This test is the same as test_pull_bundles_admission_control, except that @@ -352,13 +358,11 @@ def test_pull_bundles_admission_control_dynamic(shutdown_only): cluster.wait_for_nodes() @ray.remote - def foo(i, *args): - print("foo", i) + def foo(*args): return @ray.remote - def allocate(i): - print("allocate", i) + def allocate(*args): return np.zeros(object_size, dtype=np.uint8) args = [] @@ -369,8 +373,8 @@ def allocate(i): ] args.append(task_args) - tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)] - allocated = 
[allocate.remote(i) for i in range(num_objects)] + tasks = [foo.remote(*task_args) for task_args in args] + allocated = [allocate.remote() for _ in range(num_objects)] ray.get(tasks) del allocated diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 242799dc9281..3f5b5f7ae885 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -618,6 +618,9 @@ def test_release_during_plasma_fetch(object_spilling_config, shutdown_only): do_test_release_resource(object_spilling_config, expect_released=True) +@pytest.mark.skip( + reason="This hangs due to a deadlock between a worker getting its " + "arguments and the node pulling arguments for the next task queued.") @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") @pytest.mark.timeout(30) diff --git a/src/ray/raylet/dependency_manager.cc b/src/ray/raylet/dependency_manager.cc index 7c9faf642d3c..988893beaa47 100644 --- a/src/ray/raylet/dependency_manager.cc +++ b/src/ray/raylet/dependency_manager.cc @@ -185,6 +185,12 @@ bool DependencyManager::RequestTaskDependencies( return task_entry.num_missing_dependencies == 0; } +bool DependencyManager::IsTaskReady(const TaskID &task_id) const { + auto task_entry = queued_task_requests_.find(task_id); + RAY_CHECK(task_entry != queued_task_requests_.end()); + return task_entry->second.num_missing_dependencies == 0; +} + void DependencyManager::RemoveTaskDependencies(const TaskID &task_id) { RAY_LOG(DEBUG) << "Removing dependencies for task " << task_id; auto task_entry = queued_task_requests_.find(task_id); diff --git a/src/ray/raylet/dependency_manager.h b/src/ray/raylet/dependency_manager.h index 903a9893a579..1e7ddfcb17c1 100644 --- a/src/ray/raylet/dependency_manager.h +++ b/src/ray/raylet/dependency_manager.h @@ -37,6 +37,7 @@ class TaskDependencyManagerInterface { virtual bool RequestTaskDependencies( const TaskID &task_id, const std::vector 
&required_objects) = 0; + virtual bool IsTaskReady(const TaskID &task_id) const = 0; virtual void RemoveTaskDependencies(const TaskID &task_id) = 0; virtual ~TaskDependencyManagerInterface(){}; }; @@ -130,6 +131,14 @@ class DependencyManager : public TaskDependencyManagerInterface { bool RequestTaskDependencies(const TaskID &task_id, const std::vector &required_objects); + /// Check whether a task is ready to run. The task ID must have been + /// previously added by the caller. + /// + /// \param task_id The ID of the task to check. + /// \return Whether all of the dependencies for the task are + /// local. + bool IsTaskReady(const TaskID &task_id) const; + /// Cancel a task's dependencies. We will no longer attempt to fetch any /// remote dependencies, if no other task or worker requires them. /// diff --git a/src/ray/raylet/dependency_manager_test.cc b/src/ray/raylet/dependency_manager_test.cc index 6ea260bc3d97..c6d0ab2ee8c5 100644 --- a/src/ray/raylet/dependency_manager_test.cc +++ b/src/ray/raylet/dependency_manager_test.cc @@ -89,6 +89,7 @@ TEST_F(DependencyManagerTest, TestSimpleTask) { dependency_manager_.RequestTaskDependencies(task_id, ObjectIdsToRefs(arguments)); ASSERT_FALSE(ready); ASSERT_EQ(object_manager_mock_.active_requests.size(), 1); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // For each argument, tell the task dependency manager that the argument is // local. All arguments should be canceled as they become available locally. @@ -97,12 +98,15 @@ TEST_F(DependencyManagerTest, TestSimpleTask) { } auto ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[0]); ASSERT_TRUE(ready_task_ids.empty()); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[1]); ASSERT_TRUE(ready_task_ids.empty()); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // The task is ready to run. 
ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[2]); ASSERT_EQ(ready_task_ids.size(), 1); ASSERT_EQ(ready_task_ids.front(), task_id); + ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); // Remove the task. dependency_manager_.RemoveTaskDependencies(task_id); @@ -123,6 +127,7 @@ TEST_F(DependencyManagerTest, TestMultipleTasks) { bool ready = dependency_manager_.RequestTaskDependencies( task_id, ObjectIdsToRefs({argument_id})); ASSERT_FALSE(ready); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // The object should be requested from the object manager once for each task. ASSERT_EQ(object_manager_mock_.active_requests.size(), i + 1); } @@ -134,6 +139,7 @@ TEST_F(DependencyManagerTest, TestMultipleTasks) { std::unordered_set added_tasks(dependent_tasks.begin(), dependent_tasks.end()); for (auto &id : ready_task_ids) { ASSERT_TRUE(added_tasks.erase(id)); + ASSERT_TRUE(dependency_manager_.IsTaskReady(id)); } ASSERT_TRUE(added_tasks.empty()); @@ -160,6 +166,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { bool ready = dependency_manager_.RequestTaskDependencies(task_id, ObjectIdsToRefs(arguments)); ASSERT_FALSE(ready); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // Tell the task dependency manager that each of the arguments is now // available. @@ -176,6 +183,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { ASSERT_TRUE(ready_tasks.empty()); } } + ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); // Simulate each of the arguments getting evicted. Each object should now be // considered remote. @@ -195,6 +203,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { // the waiting state. 
ASSERT_TRUE(waiting_tasks.empty()); } + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); } // Tell the task dependency manager that each of the arguments is available @@ -212,6 +221,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { ASSERT_TRUE(ready_tasks.empty()); } } + ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); dependency_manager_.RemoveTaskDependencies(task_id); AssertNoLeaks(); diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 251e28e26aed..e1ac5eb670bb 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -222,11 +222,7 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self self_node_id_, std::dynamic_pointer_cast(cluster_resource_scheduler_), dependency_manager_, is_owner_alive, get_node_info_func, announce_infeasible_task, - worker_pool_, leased_workers_, - [this](const std::vector &object_ids, - std::vector> *results) { - return GetObjectsFromPlasma(object_ids, results); - })); + worker_pool_, leased_workers_)); placement_group_resource_manager_ = std::make_shared( std::dynamic_pointer_cast( @@ -1246,9 +1242,8 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie if ((!task_id.IsNil() || !actor_id.IsNil()) && !worker->IsDead()) { // If the worker was an actor, it'll be cleaned by GCS. if (actor_id.IsNil()) { - // Return the resources that were being used by this worker. Task task; - cluster_task_manager_->TaskFinished(worker, &task); + static_cast(local_queues_.RemoveTask(task_id, &task)); } if (disconnect_type == rpc::WorkerExitType::SYSTEM_ERROR_EXIT) { @@ -2370,33 +2365,6 @@ std::string compact_tag_string(const opencensus::stats::ViewDescriptor &view, return result.str(); } -bool NodeManager::GetObjectsFromPlasma(const std::vector &object_ids, - std::vector> *results) { - // Pin the objects in plasma by getting them and holding a reference to - // the returned buffer. 
- // NOTE: the caller must ensure that the objects already exist in plasma before - // sending a PinObjectIDs request. - std::vector plasma_results; - // TODO(swang): This `Get` has a timeout of 0, so the plasma store will not - // block when serving the request. However, if the plasma store is under - // heavy load, then this request can still block the NodeManager event loop - // since we must wait for the plasma store's reply. We should consider using - // an `AsyncGet` instead. - if (!store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results).ok()) { - return false; - } - - for (const auto &plasma_result : plasma_results) { - if (plasma_result.data == nullptr) { - results->push_back(nullptr); - } else { - results->emplace_back(std::unique_ptr( - new RayObject(plasma_result.data, plasma_result.metadata, {}))); - } - } - return true; -} - void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, rpc::PinObjectIDsReply *reply, rpc::SendReplyCallback send_reply_callback) { @@ -2406,16 +2374,33 @@ void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, object_ids.push_back(ObjectID::FromBinary(object_id_binary)); } if (object_pinning_enabled_) { - std::vector> results; - if (!GetObjectsFromPlasma(object_ids, &results)) { - RAY_LOG(WARNING) - << "Failed to get objects that should have been in the object store. These " - "objects may have been evicted while there are still references in scope."; + // Pin the objects in plasma by getting them and holding a reference to + // the returned buffer. + // NOTE: the caller must ensure that the objects already exist in plasma before + // sending a PinObjectIDs request. + std::vector plasma_results; + // TODO(swang): This `Get` has a timeout of 0, so the plasma store will not + // block when serving the request. However, if the plasma store is under + // heavy load, then this request can still block the NodeManager event loop + // since we must wait for the plasma store's reply. 
We should consider using + // an `AsyncGet` instead. + if (!store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results).ok()) { + RAY_LOG(WARNING) << "Failed to get objects to be pinned from object store."; // TODO(suquark): Maybe "Status::ObjectNotFound" is more accurate here. send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr); return; } - local_object_manager_.PinObjects(object_ids, std::move(results)); + + std::vector> objects; + for (int64_t i = 0; i < request.object_ids().size(); i++) { + if (plasma_results[i].data == nullptr) { + objects.push_back(nullptr); + } else { + objects.emplace_back(std::unique_ptr( + new RayObject(plasma_results[i].data, plasma_results[i].metadata, {}))); + } + } + local_object_manager_.PinObjects(object_ids, std::move(objects)); } // Wait for the object to be freed by the owner, which keeps the ref count. local_object_manager_.WaitForObjectFree(request.owner_address(), object_ids); diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 606dc3ac6fa7..3a68fcbae992 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -647,16 +647,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, std::unordered_map> MakeTasksByClass( const std::vector &tasks) const; - /// Get pointers to objects stored in plasma. They will be - /// released once the returned references go out of scope. - /// - /// \param[in] object_ids The objects to get. - /// \param[out] results The pointers to objects stored in - /// plasma. - /// \return Whether the request was successful. 
- bool GetObjectsFromPlasma(const std::vector &object_ids, - std::vector> *results); - /////////////////////////////////////////////////////////////////////////////////////// //////////////////// Begin of the override methods of ClusterTaskManager ////////////// // The following methods are defined in node_manager.task.cc instead of node_manager.cc diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index 109833eb59ab..a4dbff1f48dd 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -20,10 +20,7 @@ ClusterTaskManager::ClusterTaskManager( NodeInfoGetter get_node_info, std::function announce_infeasible_task, WorkerPoolInterface &worker_pool, - std::unordered_map> &leased_workers, - std::function &object_ids, - std::vector> *results)> - pin_task_arguments) + std::unordered_map> &leased_workers) : self_node_id_(self_node_id), cluster_resource_scheduler_(cluster_resource_scheduler), task_dependency_manager_(task_dependency_manager), @@ -34,8 +31,7 @@ ClusterTaskManager::ClusterTaskManager( RayConfig::instance().max_resource_shapes_per_load_report()), report_worker_backlog_(RayConfig::instance().report_worker_backlog()), worker_pool_(worker_pool), - leased_workers_(leased_workers), - pin_task_arguments_(pin_task_arguments) {} + leased_workers_(leased_workers) {} bool ClusterTaskManager::SchedulePendingTasks() { // Always try to schedule infeasible tasks in case they are now feasible. @@ -148,36 +144,11 @@ void ClusterTaskManager::DispatchScheduledTasksToWorkers( auto &task = std::get<0>(work); auto &spec = task.GetTaskSpecification(); - std::vector> args; - bool success = true; - const auto &deps = spec.GetDependencyIds(); - if (!deps.empty()) { - // This gets refs to the arguments stored in plasma. The refs should be - // deleted once we no longer need to pin the arguments. 
- success = pin_task_arguments_(deps, &args); - if (!success) { - RAY_LOG(WARNING) << "Error getting task arguments from plasma store"; - } - for (size_t i = 0; i < deps.size(); i++) { - if (args[i] == nullptr) { - // This can happen if the task's arguments were all local at some - // point, but then at least one was evicted before the task could - // be dispatched to a worker. - RAY_LOG(INFO) - << "Task " << spec.TaskId() << " argument " << deps[i] - << " was evicted before the task could be dispatched. This can happen " - "when there are many objects needed on this node. The task will be " - "scheduled once all of its dependencies are local."; - success = false; - break; - } - } - } - // An argument was evicted since this task was added to the dispatch // queue. Move it back to the waiting queue. The caller is responsible // for notifying us when the task is unblocked again. - if (!success) { + if (!spec.GetDependencies().empty() && + !task_dependency_manager_.IsTaskReady(spec.TaskId())) { waiting_tasks_[spec.TaskId()] = std::move(*work_it); work_it = dispatch_queue.erase(work_it); continue; @@ -206,12 +177,6 @@ void ClusterTaskManager::DispatchScheduledTasksToWorkers( bool worker_leased; bool remove = AttemptDispatchWork(*work_it, worker, &worker_leased); if (worker_leased) { - // Pin the arguments while the lease is active. These will be erased - // once the lease is returned. 
- num_pinned_task_arguments_ += args.size(); - RAY_CHECK(pinned_task_arguments_.emplace(spec.TaskId(), std::move(args)).second) - << spec.TaskId(); - auto reply = std::get<1>(*work_it); auto callback = std::get<2>(*work_it); Dispatch(worker, leased_workers_, task, reply, callback); @@ -330,10 +295,6 @@ void ClusterTaskManager::TaskFinished(std::shared_ptr worker, Task *task) { RAY_CHECK(worker != nullptr && task != nullptr); *task = worker->GetAssignedTask(); - auto it = pinned_task_arguments_.find(task->GetTaskSpecification().TaskId()); - RAY_CHECK(it != pinned_task_arguments_.end()); - num_pinned_task_arguments_ -= it->second.size(); - pinned_task_arguments_.erase(it); if (worker->GetAllocatedInstances() != nullptr) { ReleaseWorkerResources(worker); } @@ -672,8 +633,6 @@ std::string ClusterTaskManager::DebugStr() const { buffer << "Schedule queue length: " << num_tasks_to_schedule << "\n"; buffer << "Dispatch queue length: " << num_tasks_to_dispatch << "\n"; buffer << "Waiting tasks size: " << waiting_tasks_.size() << "\n"; - buffer << "Number of executing tasks: " << pinned_task_arguments_.size() << "\n"; - buffer << "Number of pinned task arguments: " << num_pinned_task_arguments_ << "\n"; buffer << "cluster_resource_scheduler state: " << cluster_resource_scheduler_->DebugString() << "\n"; buffer << "=================================================="; diff --git a/src/ray/raylet/scheduling/cluster_task_manager.h b/src/ray/raylet/scheduling/cluster_task_manager.h index 7f2652cebc80..f632357e10f4 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.h +++ b/src/ray/raylet/scheduling/cluster_task_manager.h @@ -2,7 +2,6 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" -#include "ray/common/ray_object.h" #include "ray/common/task/task.h" #include "ray/common/task/task_common.h" #include "ray/raylet/dependency_manager.h" @@ -61,10 +60,7 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { NodeInfoGetter 
get_node_info, std::function announce_infeasible_task, WorkerPoolInterface &worker_pool, - std::unordered_map> &leased_workers, - std::function &object_ids, - std::vector> *results)> - pin_task_arguments); + std::unordered_map> &leased_workers); /// (Step 1) Queue tasks and schedule. /// Queue task and schedule. This hanppens when processing the worker lease request. @@ -252,22 +248,6 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { WorkerPoolInterface &worker_pool_; std::unordered_map> &leased_workers_; - /// Callback to get references to task arguments. These will be pinned while - /// the task is running. - std::function &object_ids, - std::vector> *results)> - pin_task_arguments_; - - /// Arguments needed by currently granted lease requests. These should be - /// pinned before the lease is granted to ensure that the arguments are not - /// evicted before the task(s) start running. - std::unordered_map>> - pinned_task_arguments_; - - /// The total number of arguments pinned for running tasks. - /// Used for debug purposes. - size_t num_pinned_task_arguments_ = 0; - /// Determine whether a task should be immediately dispatched, /// or placed on a wait queue. 
/// diff --git a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/cluster_task_manager_test.cc index 80a9406da4d5..776e7fc53030 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc @@ -85,7 +85,7 @@ Task CreateTask(const std::unordered_map &required_resource std::make_pair(PlacementGroupID::Nil(), -1), true, ""); for (int i = 0; i < num_args; i++) { - ObjectID put_id = ObjectID::FromIndex(RandomTaskId(), /*index=*/i + 1); + ObjectID put_id = ObjectID::FromIndex(TaskID::Nil(), /*index=*/i + 1); spec_builder.AddArg(TaskArgByReference(put_id, rpc::Address())); } @@ -96,25 +96,20 @@ Task CreateTask(const std::unordered_map &required_resource class MockTaskDependencyManager : public TaskDependencyManagerInterface { public: - MockTaskDependencyManager(std::unordered_set &missing_objects) - : missing_objects_(missing_objects) {} - bool RequestTaskDependencies( const TaskID &task_id, const std::vector &required_objects) { RAY_CHECK(subscribed_tasks.insert(task_id).second); - for (auto &obj_ref : required_objects) { - if (missing_objects_.count(ObjectRefToId(obj_ref))) { - return false; - } - } - return true; + return task_ready_; } void RemoveTaskDependencies(const TaskID &task_id) { RAY_CHECK(subscribed_tasks.erase(task_id)); } - std::unordered_set &missing_objects_; + bool IsTaskReady(const TaskID &task_id) const { return task_ready_; } + + bool task_ready_ = true; + std::unordered_set subscribed_tasks; }; @@ -126,34 +121,16 @@ class ClusterTaskManagerTest : public ::testing::Test { is_owner_alive_(true), node_info_calls_(0), announce_infeasible_task_calls_(0), - dependency_manager_(missing_objects_), - task_manager_( - id_, scheduler_, dependency_manager_, - [this](const WorkerID &worker_id, const NodeID &node_id) { - return is_owner_alive_; - }, - [this](const NodeID &node_id) { - node_info_calls_++; - return node_info_[node_id]; - }, - [this](const Task &task) 
{ announce_infeasible_task_calls_++; }, pool_, - leased_workers_, - [this](const std::vector &object_ids, - std::vector> *results) { - for (auto &obj_id : object_ids) { - if (missing_objects_.count(obj_id) == 0) { - std::string meta = "metadata"; - auto metadata = const_cast( - reinterpret_cast(meta.data())); - auto meta_buffer = - std::make_shared(metadata, meta.size()); - results->emplace_back(new RayObject(nullptr, meta_buffer, {})); - } else { - results->emplace_back(nullptr); - } - } - return true; - }) {} + task_manager_(id_, scheduler_, dependency_manager_, + [this](const WorkerID &worker_id, const NodeID &node_id) { + return is_owner_alive_; + }, + [this](const NodeID &node_id) { + node_info_calls_++; + return node_info_[node_id]; + }, + [this](const Task &task) { announce_infeasible_task_calls_++; }, + pool_, leased_workers_) {} void SetUp() {} @@ -176,25 +153,13 @@ class ClusterTaskManagerTest : public ::testing::Test { ASSERT_TRUE(task_manager_.tasks_to_dispatch_.empty()); ASSERT_TRUE(task_manager_.waiting_tasks_.empty()); ASSERT_TRUE(task_manager_.infeasible_tasks_.empty()); - ASSERT_TRUE(task_manager_.pinned_task_arguments_.empty()); - ASSERT_EQ(task_manager_.num_pinned_task_arguments_, 0); ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); } - void AssertPinnedTaskArgumentsEquals(const TaskID &task_id, size_t num_args_expected) { - ASSERT_EQ(task_manager_.pinned_task_arguments_[task_id].size(), num_args_expected); - size_t num_args = 0; - for (auto &args : task_manager_.pinned_task_arguments_) { - num_args += args.second.size(); - } - ASSERT_EQ(task_manager_.num_pinned_task_arguments_, num_args); - } - NodeID id_; std::shared_ptr scheduler_; MockWorkerPool pool_; std::unordered_map> leased_workers_; - std::unordered_set missing_objects_; bool is_owner_alive_; @@ -238,11 +203,6 @@ TEST_F(ClusterTaskManagerTest, BasicTest) { ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); - Task finished_task; - 
task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } @@ -292,9 +252,8 @@ TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { }; /* Blocked on dependencies */ - auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); - auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; - missing_objects_.insert(missing_arg); + dependency_manager_.task_ready_ = false; + auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 1); std::unordered_set expected_subscribed_tasks = { task.GetTaskSpecification().TaskId()}; task_manager_.QueueAndScheduleTask(task, &reply, callback); @@ -305,42 +264,36 @@ TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { ASSERT_EQ(pool_.workers.size(), 2); /* This task can run */ - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 1); + auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}); task_manager_.QueueAndScheduleTask(task2, &reply, callback); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); - AssertPinnedTaskArgumentsEquals(task2.GetTaskSpecification().TaskId(), 1); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); /* First task is unblocked now, but resources are no longer available */ - missing_objects_.erase(missing_arg); + dependency_manager_.task_ready_ = true; auto id = task.GetTaskSpecification().TaskId(); std::vector unblocked = {id}; task_manager_.TasksUnblocked(unblocked); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); - AssertPinnedTaskArgumentsEquals(task2.GetTaskSpecification().TaskId(), 1); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); /* Second task finishes, making space for the original task */ - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, 
&finished_task); leased_workers_.clear(); + task_manager_.ReleaseWorkerResources(worker); task_manager_.ScheduleAndDispatchTasks(); ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); // Task2 is now done so task can run. - AssertPinnedTaskArgumentsEquals(task.GetTaskSpecification().TaskId(), 2); ASSERT_EQ(num_callbacks, 2); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 0); - - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); AssertNoLeaks(); } @@ -389,12 +342,6 @@ TEST_F(ClusterTaskManagerTest, TestSpillAfterAssigned) { // The second task was spilled. ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); - - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } @@ -438,12 +385,6 @@ TEST_F(ClusterTaskManagerTest, TaskCancellationTest) { ASSERT_FALSE(callback_called); ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(leased_workers_.size(), 1); - - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } @@ -674,12 +615,6 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { task_manager_.FillResourceUsage(data); auto resource_load_by_shape = data->resource_load_by_shape(); ASSERT_EQ(resource_load_by_shape.resource_demands().size(), 0); - - while (!leased_workers_.empty()) { - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - leased_workers_.erase(leased_workers_.begin()); - } AssertNoLeaks(); } } @@ -850,9 +785,8 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { }; /* Blocked on dependencies */ + dependency_manager_.task_ready_ = false; auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); - 
auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; - missing_objects_.insert(missing_arg); std::unordered_set expected_subscribed_tasks = { task.GetTaskSpecification().TaskId()}; task_manager_.QueueAndScheduleTask(task, &reply, callback); @@ -861,7 +795,7 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Task is unblocked now */ - missing_objects_.erase(missing_arg); + dependency_manager_.task_ready_ = true; pool_.workers.clear(); auto id = task.GetTaskSpecification().TaskId(); task_manager_.TasksUnblocked({id}); @@ -870,7 +804,7 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Task argument gets evicted */ - missing_objects_.insert(missing_arg); + dependency_manager_.task_ready_ = false; pool_.PushWorker(std::dynamic_pointer_cast(worker)); task_manager_.ScheduleAndDispatchTasks(); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); @@ -878,16 +812,10 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Worker available and arguments available */ - missing_objects_.erase(missing_arg); + dependency_manager_.task_ready_ = true; task_manager_.TasksUnblocked({id}); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); - - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } diff --git a/src/ray/raylet/test/util.h b/src/ray/raylet/test/util.h index c43a386fba14..8527220e3df8 100644 --- a/src/ray/raylet/test/util.h +++ b/src/ray/raylet/test/util.h @@ -33,7 +33,7 @@ class MockWorker : public WorkerInterface { void AssignTaskId(const TaskID &task_id) {} - void SetAssignedTask(const Task &assigned_task) { task_ = assigned_task; } + void SetAssignedTask(const Task &assigned_task) {} const std::string IpAddress() const { return 
address_.ip_address(); } @@ -162,7 +162,11 @@ class MockWorker : public WorkerInterface { void SetBundleId(const BundleID &bundle_id) { bundle_id_ = bundle_id; } - Task &GetAssignedTask() { return task_; } + Task &GetAssignedTask() { + RAY_CHECK(false) << "Method unused"; + auto *t = new Task(); + return *t; + } bool IsRegistered() { RAY_CHECK(false) << "Method unused"; @@ -184,7 +188,6 @@ class MockWorker : public WorkerInterface { bool is_detached_actor_; BundleID bundle_id_; bool blocked_ = false; - Task task_; }; } // namespace raylet From b772faee9fd26b0105aa4fb672bbb4ca793cbf3e Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 152/244] Revert "[docker] Build Python3.6 & Python3.8 Docker Images (#13548)" This reverts commit a8186620856bce52b103f871059640cbc5ea959a. --- .travis.yml | 35 +---- ci/travis/build-docker-images.py | 208 +++++++++++----------------- ci/travis/determine_tests_to_run.py | 2 - docker/base-deps/Dockerfile | 8 +- docker/ray-ml/Dockerfile | 10 +- python/requirements_ml_docker.txt | 3 +- 6 files changed, 91 insertions(+), 175 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8cff56d419d2..4d8f8ddd1255 100644 --- a/.travis.yml +++ b/.travis.yml @@ -209,32 +209,10 @@ matrix: - . ./ci/travis/ci.sh test_wheels - export PATH="$HOME/miniconda3/bin:$PATH" - python -m pip install docker - - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY37; fi + - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py; fi - bash ./java/build-jar-multiplatform.sh linux cache: false - - # Build Py36 & Py38 Docker Images - - os: linux - env: - - LINUX_WHEELS=1 - - DOCKER_BUILD_PY36_38=1 - - PYTHONWARNINGS=ignore - language: java - jdk: openjdk8 - install: - - . ./ci/travis/ci.sh init RAY_CI_LINUX_WHEELS_AFFECTED - before_script: - - . 
./ci/travis/ci.sh build - script: - - wget --quiet "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda3.sh - - bash miniconda3.sh -b -p "$HOME/miniconda3" - - export PATH="$HOME/miniconda3/bin:$PATH" - - conda install -y python=3.7.6 - - python -m pip install docker - - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY36_PY38; fi - cache: false - # Build and deploy multi-platform jars. - os: linux env: @@ -513,7 +491,7 @@ deploy: - provider: script edge: true # This supposedly opts in to deploy v2. - script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY37 + script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py skip_cleanup: true on: repo: ray-project/ray @@ -552,12 +530,3 @@ deploy: repo: ray-project/ray branch: master condition: $MULTIPLATFORM_JARS = 1 || $MAC_JARS = 1 || $LINUX_JARS = 1 - - - provider: script - edge: true # This supposedly opts in to deploy v2. 
- script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY36_PY38 - skip_cleanup: true - on: - repo: ray-project/ray - all_branches: true - condition: $LINUX_WHEELS = 1 \ No newline at end of file diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py index ad69a15dbcaa..c549bc95e60a 100644 --- a/ci/travis/build-docker-images.py +++ b/ci/travis/build-docker-images.py @@ -15,7 +15,7 @@ print = functools.partial(print, file=sys.stderr, flush=True) DOCKER_USERNAME = "raytravisbot" DOCKER_CLIENT = None -PYTHON_WHL_VERSION = "cp3" +PYTHON_WHL_VERSION = "cp37m" DOCKER_HUB_DESCRIPTION = { "base-deps": ("Internal Image, refer to " @@ -29,8 +29,6 @@ "https://hub.docker.com/repository/docker/rayproject/ray-ml") } -PY_MATRIX = {"-py36": "3.6.12", "-py37": "3.7.7", "-py38": "3.8.5"} - def _merge_build(): return os.environ.get("TRAVIS_PULL_REQUEST").lower() == "false" @@ -54,18 +52,13 @@ def _get_root_dir(): return os.path.join(_get_curr_dir(), "../../") -def _get_wheel_name(minor_version_number): - if minor_version_number: - matches = glob.glob(f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}" - f"{minor_version_number}*-manylinux*") - assert len(matches) == 1, ( - f"Found ({len(matches)}) matches for '*{PYTHON_WHL_VERSION}" - f"{minor_version_number}*-manylinux*' instead of 1") - return os.path.basename(matches[0]) - else: - matches = glob.glob( - f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}*-manylinux*") - return [os.path.basename(i) for i in matches] +def _get_wheel_name(): + matches = glob.glob( + f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}-manylinux*") + assert len(matches) == 1, ( + f"Found ({len(matches)}) matches " + f"'*{PYTHON_WHL_VERSION}-manylinux*' instead of 1") + return os.path.basename(matches[0]) def _docker_affected(): @@ -88,76 +81,64 @@ def _docker_affected(): def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]: built_images = [] for gpu in 
["-cpu", "-gpu"]: - for py_name, py_version in PY_MATRIX.items(): - build_args = {} - build_args["PYTHON_VERSION"] = py_version - # I.e. "-py36"[-1] == 6 - build_args["PYTHON_MINOR_VERSION"] = py_name[-1] - - if image_name == "base-deps": - build_args["BASE_IMAGE"] = ( - "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" - if gpu == "-gpu" else "ubuntu:focal") + build_args = {} + if image_name == "base-deps": + build_args["BASE_IMAGE"] = ( + "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" + if gpu == "-gpu" else "ubuntu:focal") + else: + build_args["GPU"] = gpu + + if "ray" in image_name: + build_args["WHEEL_PATH"] = f".whl/{_get_wheel_name()}" + + tagged_name = f"rayproject/{image_name}:nightly{gpu}" + for i in range(2): + output = DOCKER_CLIENT.api.build( + path=os.path.join(_get_root_dir(), "docker", image_name), + tag=tagged_name, + nocache=no_cache, + buildargs=build_args) + + full_output = "" + try: + start = datetime.datetime.now() + current_iter = start + for line in output: + if datetime.datetime.now( + ) - current_iter >= datetime.timedelta(minutes=5): + current_iter = datetime.datetime.now() + elapsed = datetime.datetime.now() - start + print(f"Still building {tagged_name} after " + f"{elapsed.seconds} seconds") + full_output += line.decode("utf-8") + except Exception as e: + print(f"FAILURE with error {e}") + + if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: + print(f"ERROR building: {tagged_name} & error below:") + print(full_output) + if (i == 1): + raise Exception("FAILED TO BUILD IMAGE") + print("TRYING AGAIN") else: - # NOTE(ilr) This is a bit of an abuse of the name "GPU" - build_args["GPU"] = f"{py_name}{gpu}" - - if image_name in ["ray", "ray-deps"]: - wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"]) - build_args["WHEEL_PATH"] = f".whl/{wheel}" - - tagged_name = f"rayproject/{image_name}:nightly{py_name}{gpu}" - for i in range(2): - cleanup = DOCKER_CLIENT.containers.prune().get( - "SpaceReclaimed") - if cleanup is not None: - 
print(f"Cleaned up {cleanup / (2**20)}MB") - output = DOCKER_CLIENT.api.build( - path=os.path.join(_get_root_dir(), "docker", image_name), - tag=tagged_name, - nocache=no_cache, - buildargs=build_args) - - full_output = "" - try: - start = datetime.datetime.now() - current_iter = start - for line in output: - if datetime.datetime.now( - ) - current_iter >= datetime.timedelta(minutes=5): - current_iter = datetime.datetime.now() - elapsed = datetime.datetime.now() - start - print(f"Still building {tagged_name} after " - f"{elapsed.seconds} seconds") - full_output += line.decode("utf-8") - except Exception as e: - print(f"FAILURE with error {e}") - - if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: - print(f"ERROR building: {tagged_name} & error below:") - print(full_output) - if (i == 1): - raise Exception("FAILED TO BUILD IMAGE") - print("TRYING AGAIN") - else: - break - - print("BUILT: ", tagged_name) - built_images.append(tagged_name) + break + + print("BUILT: ", tagged_name) + built_images.append(tagged_name) return built_images def copy_wheels(): root_dir = _get_root_dir() - wheels = _get_wheel_name(None) - for wheel in wheels: - source = os.path.join(root_dir, ".whl", wheel) - ray_dst = os.path.join(root_dir, "docker/ray/.whl/") - ray_dep_dst = os.path.join(root_dir, "docker/ray-deps/.whl/") - os.makedirs(ray_dst, exist_ok=True) - shutil.copy(source, ray_dst) - os.makedirs(ray_dep_dst, exist_ok=True) - shutil.copy(source, ray_dep_dst) + wheel = _get_wheel_name() + source = os.path.join(root_dir, ".whl", wheel) + ray_dst = os.path.join(root_dir, "docker/ray/.whl/") + ray_dep_dst = os.path.join(root_dir, "docker/ray-deps/.whl/") + os.makedirs(ray_dst, exist_ok=True) + shutil.copy(source, ray_dst) + os.makedirs(ray_dep_dst, exist_ok=True) + shutil.copy(source, ray_dep_dst) def build_or_pull_base_images(is_docker_affected: bool) -> List[str]: @@ -258,48 +239,31 @@ def get_new_tag(old_tag, new_tag): image_list.extend(["base-deps", "ray-deps"]) for image in 
image_list: - for py_version in PY_MATRIX.keys(): - full_image = f"rayproject/{image}" + full_image = f"rayproject/{image}" - # Tag "nightly-py3x" from "nightly-py3x-cpu" + # Generate :nightly from nightly-cpu + DOCKER_CLIENT.api.tag( + image=f"{full_image}:nightly-cpu", + repository=full_image, + tag="nightly") + + for arch_tag in ["-cpu", "-gpu", ""]: + full_arch_tag = f"nightly{arch_tag}" + # Do not tag release builds because they are no longer up to date + # after the branch cut. + if not _release_build(): + # Tag and push rayproject/:nightly + docker_push(full_image, full_arch_tag) + + # Ex: specific_tag == "1.0.1" or "" or "" + specific_tag = get_new_tag( + full_arch_tag, date_tag if "-deps" in image else sha_tag) + # Tag and push rayproject/: DOCKER_CLIENT.api.tag( - image=f"{full_image}:nightly{py_version}-cpu", + image=f"{full_image}:{full_arch_tag}", repository=full_image, - tag=f"nightly{py_version}") - - for arch_tag in ["-cpu", "-gpu", ""]: - full_arch_tag = f"nightly{py_version}{arch_tag}" - # Do not tag release builds because they are no longer up to - # date after the branch cut. 
- if not _release_build(): - # Tag and push rayproject/:nightly - docker_push(full_image, full_arch_tag) - - # Ex: specific_tag == "1.0.1" or "" or "" - specific_tag = get_new_tag( - full_arch_tag, date_tag if "-deps" in image else sha_tag) - - # Tag and push rayproject/: - DOCKER_CLIENT.api.tag( - image=f"{full_image}:{full_arch_tag}", - repository=full_image, - tag=specific_tag) - docker_push(full_image, specific_tag) - - if "-py37" in py_version: - non_python_specific_tag = specific_tag.replace("-py37", "") - DOCKER_CLIENT.api.tag( - image=f"{full_image}:{full_arch_tag}", - repository=full_image, - tag=non_python_specific_tag) - docker_push(full_image, non_python_specific_tag) - - non_python_nightly_tag = full_arch_tag.replace("-py37", "") - DOCKER_CLIENT.api.tag( - image=f"{full_image}:{full_arch_tag}", - repository=full_image, - tag=non_python_nightly_tag) - docker_push(full_image, non_python_nightly_tag) + tag=specific_tag) + docker_push(full_image, specific_tag) # Push infra here: @@ -342,14 +306,6 @@ def push_readmes(): if __name__ == "__main__": print("RUNNING WITH: ", sys.version) - if len(sys.argv) == 2: - version_to_drop = sys.argv[1] - if version_to_drop == "PY37": - PY_MATRIX.pop("-py36") - PY_MATRIX.pop("-py38") - else: - PY_MATRIX.pop("-py37") - print("Building the following python versions: ", PY_MATRIX) if os.environ.get("TRAVIS") == "true": is_docker_affected = _docker_affected() if _merge_build() or is_docker_affected: diff --git a/ci/travis/determine_tests_to_run.py b/ci/travis/determine_tests_to_run.py index cba016fcf610..70eefc16a566 100644 --- a/ci/travis/determine_tests_to_run.py +++ b/ci/travis/determine_tests_to_run.py @@ -124,8 +124,6 @@ def list_changed_files(commit_range): for prefix in skip_prefix_list): # nothing is run but linting in these cases pass - elif changed_file.endswith("build-docker-images.py"): - RAY_CI_DOCKER_AFFECTED = 1 elif changed_file.startswith("src/"): RAY_CI_TUNE_AFFECTED = 1 RAY_CI_SGD_AFFECTED = 1 diff --git 
a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index e00ca141c9d5..278fad1ec73d 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -30,8 +30,6 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ git \ wget \ cmake \ - g++ \ - zlib1g-dev \ $(if [ "$AUTOSCALER" = "autoscaler" ]; then echo \ tmux \ screen \ @@ -54,14 +52,12 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ numpy==1.15.4 \ psutil \ blist \ - atari-py \ # blist is needed for numpy (which is re-installed when ray is installed) - # atari-py is built from source for Python 3.8 (requires g++ & zlib1g-dev) # To avoid the following error on Jenkins: # AttributeError: 'numpy.ufunc' object has no attribute '__module__' && $HOME/anaconda3/bin/pip uninstall -y dask \ - # We install cmake temporarily to get psutil, blist & atari-py - && sudo apt-get autoremove -y cmake g++ zlib1g-dev \ + # We install cmake temporarily to get psutil + && sudo apt-get autoremove -y cmake \ # Either install kubectl or remove wget && (if [ "$AUTOSCALER" = "autoscaler" ]; \ then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - \ diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 908351df19d9..25211085edc7 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -1,13 +1,12 @@ ARG GPU FROM rayproject/ray:nightly"$GPU" -ARG PYTHON_MINOR_VERSION=7 # We have to uninstall wrapt this way for Tensorflow compatibility COPY requirements.txt ./ COPY requirements_ml_docker.txt ./ COPY requirements_rllib.txt ./ # Docker image uses Python 3.7 -COPY linux-py3."$PYTHON_MINOR_VERSION"-requirements_tune.txt ./requirements_tune.txt +COPY linux-py3.7-requirements_tune.txt ./requirements_tune.txt RUN sudo apt-get update \ && sudo apt-get install -y gcc \ @@ -15,13 +14,12 @@ RUN sudo apt-get update \ libgtk2.0-dev \ zlib1g-dev \ libgl1-mesa-dev \ - && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r 
requirements_ml_docker.txt \ && $HOME/anaconda3/bin/pip --use-deprecated=legacy-resolver --no-cache-dir install -r requirements.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_rllib.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_tune.txt \ - # Remove dataclasses & typing because they are included in Python > 3.6 - && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \ - $HOME/anaconda3/bin/pip uninstall dataclasses typing -y; fi \ + && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \ + # Remove dataclasses & typing because they are included in Py3.7 + && $HOME/anaconda3/bin/pip uninstall dataclasses typing -y \ && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \ && sudo rm requirements_tune.txt && sudo rm requirements_rllib.txt \ && sudo apt-get clean diff --git a/python/requirements_ml_docker.txt b/python/requirements_ml_docker.txt index bbecb5bd873e..c61ba0c055f6 100644 --- a/python/requirements_ml_docker.txt +++ b/python/requirements_ml_docker.txt @@ -3,5 +3,4 @@ tensorflow-gpu>=2.4.0 -f https://download.pytorch.org/whl/torch_stable.html torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html -torchvision==0.8.2+cu110 -pip; python_version > "3.7" +torchvision==0.8.2+cu110 \ No newline at end of file From 5ed79628e6f72c71ad87f87982cbbe886e1f8c4b Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 153/244] Revert "[Docker] usage of python-version (#13011)" This reverts commit ecaa77e68c0d914e7c9e474dddc25a8c530ac139. 
--- build-docker.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/build-docker.sh b/build-docker.sh index 42f9068954f1..b39336186caf 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -8,8 +8,7 @@ set -x GPU="" BASE_IMAGE="ubuntu:focal" WHEEL_URL="https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" -PYTHON_VERSION="3.7.7" - +PYTHON_VERSION="" while [[ $# -gt 0 ]] do @@ -42,7 +41,6 @@ case $key in --python-version) # Python version to install. e.g. 3.7.7. # Changing python versions may require a different wheel. - # If not provided defaults to 3.7.7 shift PYTHON_VERSION=$1 ;; @@ -61,7 +59,7 @@ for IMAGE in "base-deps" "ray-deps" "ray" do cp "$WHEEL" "docker/$IMAGE/$(basename "$WHEEL")" if [ $OUTPUT_SHA ]; then - IMAGE_SHA=$(docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" --build-arg PYTHON_VERSION="$PYTHON_VERSION" -q -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE) + IMAGE_SHA=$(docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" -q -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE) echo "rayproject/$IMAGE:nightly$GPU SHA:$IMAGE_SHA" else docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" --build-arg PYTHON_VERSION="$PYTHON_VERSION" -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE From 9ccf8b341cc9f88e271cfa68ac5969d5c681eee2 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 154/244] Revert "[Serve] Add ServeHandle metrics (#13640)" This reverts commit 0af0b741fb9396020594b8d1a2bc4f3f091982fc. 
--- doc/source/serve/advanced.rst | 4 -- python/ray/serve/handle.py | 15 -------- python/ray/serve/router.py | 55 ++++++++------------------- python/ray/serve/tests/test_api.py | 4 -- python/ray/serve/tests/test_router.py | 2 +- 5 files changed, 16 insertions(+), 64 deletions(-) diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst index 542a3ce188ec..3ac191f1b3a4 100644 --- a/doc/source/serve/advanced.rst +++ b/doc/source/serve/advanced.rst @@ -321,10 +321,6 @@ The following metrics are exposed by Ray Serve: - The number of HTTP requests processed. * - ``serve_num_router_requests`` - The number of requests processed by the router. - * - ``serve_handle_request_counter`` - - The number of requests processed by this ServeHandle. - * - ``backend_queued_queries`` - - The number of queries for this backend waiting to be assigned to a replica. To see this in action, run ``ray start --head --metrics-export-port=8080`` in your terminal, and then run the following script: diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 475f64556cb5..c6951c6380b9 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -4,8 +4,6 @@ from typing import Any, Dict, Optional, Union from enum import Enum -from ray.serve.utils import get_random_letters -from ray.util import metrics from ray.serve.router import Router @@ -49,17 +47,6 @@ def __init__(self, self.router = router self.endpoint_name = endpoint_name self.handle_options = handle_options or HandleOptions() - self.handle_tag = f"{self.endpoint_name}#{get_random_letters()}" - - self.request_counter = metrics.Count( - "serve_handle_request_counter", - description=("The number of handle.remote() calls that have been " - "made on this handle."), - tag_keys=("handle", "endpoint")) - self.request_counter.set_default_tags({ - "handle": self.handle_tag, - "endpoint": self.endpoint_name - }) def options(self, *, @@ -105,7 +92,6 @@ async def remote(self, ``**kwargs``: All keyword 
arguments will be available in ``request.query_params``. """ - self.request_counter.record(1) return await self.router._remote( self.endpoint_name, self.handle_options, request_data, kwargs) @@ -132,7 +118,6 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, ``**kwargs``: All keyword arguments will be available in ``request.args``. """ - self.request_counter.record(1) coro = self.router._remote(self.endpoint_name, self.handle_options, request_data, kwargs) future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( diff --git a/python/ray/serve/router.py b/python/ray/serve/router.py index ec887d006c43..c4a87b49bb60 100644 --- a/python/ray/serve/router.py +++ b/python/ray/serve/router.py @@ -1,6 +1,7 @@ import asyncio from enum import Enum import itertools +from collections import defaultdict from dataclasses import dataclass, field from typing import Any, ChainMap, Dict, Iterable, List, Optional @@ -48,12 +49,12 @@ class Query: class ReplicaSet: """Data structure representing a set of replica actor handles""" - def __init__(self, backend_tag): - self.backend_tag = backend_tag + def __init__(self): # NOTE(simon): We have to do this because max_concurrent_queries # and the replica handles come from different long poll keys. self.max_concurrent_queries: int = 8 self.in_flight_queries: Dict[ActorHandle, set] = dict() + # The iterator used for load balancing among replicas. Using itertools # cycle, we implements a round-robin policy, skipping overloaded # replicas. @@ -63,25 +64,15 @@ def __init__(self, backend_tag): self.replica_iterator = itertools.cycle(self.in_flight_queries.keys()) # Used to unblock this replica set waiting for free replicas. A newly - # added replica or updated max_concurrent_queries value means the + # added replica or updated max_concurrenty_queries value means the # query that waits on a free replica might be unblocked on. 
self.config_updated_event = asyncio.Event() - self.num_queued_queries = 0 - self.num_queued_queries_gauge = metrics.Gauge( - "serve_backend_queued_queries", - description=( - "The current number of queries to this backend waiting" - " to be assigned to a replica."), - tag_keys=("backend", "endpoint")) - self.num_queued_queries_gauge.set_default_tags({ - "backend": self.backend_tag - }) def set_max_concurrent_queries(self, new_value): if new_value != self.max_concurrent_queries: self.max_concurrent_queries = new_value logger.debug( - f"ReplicaSet: changing max_concurrent_queries to {new_value}") + f"ReplicaSet: chaging max_concurrent_queries to {new_value}") self.config_updated_event.set() def update_worker_replicas(self, worker_replicas: Iterable[ActorHandle]): @@ -101,7 +92,7 @@ def update_worker_replicas(self, worker_replicas: Iterable[ActorHandle]): self.config_updated_event.set() def _try_assign_replica(self, query: Query) -> Optional[ray.ObjectRef]: - """Try to assign query to a replica, return the object ref if succeeded + """Try to assign query to a replica, return the object ref is succeeded or return None if it can't assign this query to any replicas. """ for _ in range(len(self.in_flight_queries.keys())): @@ -139,10 +130,6 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: and only send a query to available replicas (determined by the backend max_concurrent_quries value.) """ - endpoint = query.metadata.endpoint - self.num_queued_queries += 1 - self.num_queued_queries_gauge.record( - self.num_queued_queries, tags={"endpoint": endpoint}) assigned_ref = self._try_assign_replica(query) while assigned_ref is None: # Can't assign a replica right now. 
logger.debug("Failed to assign a replica for " @@ -160,12 +147,8 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: return_when=asyncio.FIRST_COMPLETED) if self.config_updated_event.is_set(): self.config_updated_event.clear() - # We are pretty sure a free replica is ready now, let's recurse and - # assign this query a replica. + # We are pretty sure a free replica is ready now. assigned_ref = self._try_assign_replica(query) - self.num_queued_queries -= 1 - self.num_queued_queries_gauge.record( - self.num_queued_queries, tags={"endpoint": endpoint}) return assigned_ref @@ -185,8 +168,7 @@ def __init__(self, controller_handle: ActorHandle): self.controller = controller_handle self.endpoint_policies: Dict[str, EndpointPolicy] = dict() - - self.backend_replicas: Dict[str, ReplicaSet] = dict() + self.backend_replicas: Dict[str, ReplicaSet] = defaultdict(ReplicaSet) self._pending_endpoints: Dict[str, asyncio.Future] = dict() @@ -230,8 +212,8 @@ async def _update_replica_handles(self, replica_handles): replica_handles) for backend_tag, replica_handles in ChainMap(added, updated).items(): - self._get_or_create_replica_set( - backend_tag).update_worker_replicas(replica_handles) + self.backend_replicas[backend_tag].update_worker_replicas( + replica_handles) for backend_tag in removed.keys(): if backend_tag in self.backend_replicas: @@ -241,9 +223,8 @@ async def _update_backend_configs(self, backend_configs): added, removed, updated = compute_dict_delta(self.backend_replicas, backend_configs) for backend_tag, config in ChainMap(added, updated).items(): - self._get_or_create_replica_set( - backend_tag).set_max_concurrent_queries( - config.max_concurrent_queries) + self.backend_replicas[backend_tag].set_max_concurrent_queries( + config.max_concurrent_queries) for backend_tag in removed.keys(): if backend_tag in self.backend_replicas: @@ -280,17 +261,11 @@ async def assign_request( endpoint_policy = self.endpoint_policies[endpoint] chosen_backend, *shadow_backends 
= endpoint_policy.assign(query) - result_ref = await self._get_or_create_replica_set( - chosen_backend).assign_replica(query) + result_ref = await self.backend_replicas[chosen_backend + ].assign_replica(query) for backend in shadow_backends: - (await self._get_or_create_replica_set(backend) - .assign_replica(query)) + await self.backend_replicas[backend].assign_replica(query) self.num_router_requests.record(1, tags={"endpoint": endpoint}) return result_ref - - def _get_or_create_replica_set(self, backend_name): - if backend_name not in self.backend_replicas: - self.backend_replicas[backend_name] = ReplicaSet(backend_name) - return self.backend_replicas[backend_name] diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index 62f239f78782..a35f7e54b361 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -875,10 +875,6 @@ def verify_metrics(do_assert=False): # gauge "replica_processing_queries", "replica_queued_queries", - # handle - "serve_handle_request_counter", - # ReplicaSet - "backend_queued_queries" ] for metric in expected_metrics: # For the final error round diff --git a/python/ray/serve/tests/test_router.py b/python/ray/serve/tests/test_router.py index 9b8eb5548b7c..231ac11a5bfd 100644 --- a/python/ray/serve/tests/test_router.py +++ b/python/ray/serve/tests/test_router.py @@ -204,7 +204,7 @@ async def num_queries(self): return self._num_queries # We will test a scenario with two replicas in the replica set. - rs = ReplicaSet("my_backend") + rs = ReplicaSet() workers = [MockWorker.remote() for _ in range(2)] rs.set_max_concurrent_queries(1) rs.update_worker_replicas(workers) From 3bbd0f4c93891f15674624681fd489f128865cc0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 155/244] Revert "[RLlib] Fix custom multi action distr (#13681)" This reverts commit 44f52af9ebe3e32c2c581a7d68a6f69930eabcb2. 
--- rllib/models/catalog.py | 5 ++-- rllib/tests/test_catalog.py | 52 +++---------------------------------- 2 files changed, 6 insertions(+), 51 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 66796d71f907..6d0bfd111296 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -199,14 +199,13 @@ def get_action_dist( config = config or MODEL_DEFAULTS # Custom distribution given. if config.get("custom_action_dist"): - custom_action_config = config.copy() - action_dist_name = custom_action_config.pop("custom_action_dist") + action_dist_name = config["custom_action_dist"] logger.debug( "Using custom action distribution {}".format(action_dist_name)) dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) return ModelCatalog._get_multi_action_distribution( - dist_cls, action_space, custom_action_config, framework) + dist_cls, action_space, config, framework) # Dist_type is given directly as a class. elif type(dist_type) is type and \ diff --git a/rllib/tests/test_catalog.py b/rllib/tests/test_catalog.py index bbd1ec1bbbaa..b98f7143a56d 100644 --- a/rllib/tests/test_catalog.py +++ b/rllib/tests/test_catalog.py @@ -1,15 +1,13 @@ -from functools import partial import gym -from gym.spaces import Box, Dict, Discrete +from gym.spaces import Box, Discrete import numpy as np import unittest import ray -from ray.rllib.models import ActionDistribution, ModelCatalog, MODEL_DEFAULTS -from ray.rllib.models.preprocessors import NoPreprocessor, Preprocessor -from ray.rllib.models.tf.tf_action_dist import MultiActionDistribution, \ - TFActionDistribution +from ray.rllib.models import ModelCatalog, MODEL_DEFAULTS, ActionDistribution from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.models.preprocessors import NoPreprocessor, Preprocessor from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf, 
try_import_torch from ray.rllib.utils.test_utils import framework_iterator @@ -62,12 +60,6 @@ def logp(self, x): return tf.zeros(self.output_shape) -class CustomMultiActionDistribution(MultiActionDistribution): - @override(MultiActionDistribution) - def entropy(self): - raise NotImplementedError - - class TestModelCatalog(unittest.TestCase): def tearDown(self): ray.shutdown() @@ -169,42 +161,6 @@ class Model(): with self.assertRaises(NotImplementedError): dist.entropy() - def test_custom_multi_action_distribution(self): - class Model(): - pass - - ray.init( - object_store_memory=1000 * 1024 * 1024, - ignore_reinit_error=True) # otherwise fails sometimes locally - # registration - ModelCatalog.register_custom_action_dist( - "test", CustomMultiActionDistribution) - s1 = Discrete(5) - s2 = Box(0, 1, shape=(3, ), dtype=np.float32) - spaces = dict(action_1=s1, action_2=s2) - action_space = Dict(spaces) - # test retrieving it - model_config = MODEL_DEFAULTS.copy() - model_config["custom_action_dist"] = "test" - dist_cls, param_shape = ModelCatalog.get_action_dist( - action_space, model_config) - self.assertIsInstance(dist_cls, partial) - self.assertEqual(param_shape, s1.n + 2 * s2.shape[0]) - - # test the class works as a distribution - dist_input = tf1.placeholder(tf.float32, (None, param_shape)) - model = Model() - model.model_config = model_config - dist = dist_cls(dist_input, model=model) - self.assertIsInstance(dist.sample(), dict) - self.assertIn("action_1", dist.sample()) - self.assertIn("action_2", dist.sample()) - self.assertEqual(dist.sample()["action_1"].dtype, tf.int64) - self.assertEqual(dist.sample()["action_2"].shape[1:], s2.shape) - - with self.assertRaises(NotImplementedError): - dist.entropy() - if __name__ == "__main__": import pytest From f3a7c2eddf49d7f06b4fe2484b0046f66d82944c Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 156/244] Revert "[Ax] Align 
optimization mode and reported SEM with Ax (#13611)" This reverts commit fcf7ef9ed520a2ad3e6f8ead82d980246ffc8ddc. --- python/ray/tune/suggest/ax.py | 44 +++++++------------------ python/ray/tune/tests/test_sample.py | 10 +++--- python/ray/tune/tests/test_searchers.py | 6 ++-- 3 files changed, 17 insertions(+), 43 deletions(-) diff --git a/python/ray/tune/suggest/ax.py b/python/ray/tune/suggest/ax.py index 85aa79f30284..7cccf74a79d6 100644 --- a/python/ray/tune/suggest/ax.py +++ b/python/ray/tune/suggest/ax.py @@ -1,6 +1,7 @@ import copy from typing import Dict, List, Optional, Union +from ax.service.ax_client import AxClient from ray.tune.result import DEFAULT_METRIC from ray.tune.sample import Categorical, Float, Integer, LogUniform, \ Quantized, Uniform @@ -11,17 +12,8 @@ try: import ax - from ax.service.ax_client import AxClient except ImportError: - ax = AxClient = None - -# This exception only exists in newer Ax releases for python 3.7 -try: - from ax.exceptions.generation_strategy import \ - MaxParallelismReachedException -except ImportError: - MaxParallelismReachedException = Exception - + ax = None import logging from ray.tune.suggest import Searcher @@ -132,7 +124,6 @@ def __init__(self, assert ax is not None, """Ax must be installed! You can install AxSearch with the command: `pip install ax-platform sqlalchemy`.""" - if mode: assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." 
@@ -160,6 +151,7 @@ def __init__(self, self.max_concurrent = max_concurrent + self._objective_name = metric self._parameters = [] self._live_trial_mapping = {} @@ -187,10 +179,6 @@ def _setup_experiment(self): "`AxClient.create_experiment()`, or you should pass an " "Ax search space as the `space` parameter to `AxSearch`, " "or pass a `config` dict to `tune.run()`.") - if self._mode not in ["min", "max"]: - raise ValueError( - "Please specify the `mode` argument when initializing " - "the `AxSearch` object or pass it to `tune.run()`.") self._ax.create_experiment( parameters=self._space, objective_name=self._metric, @@ -200,25 +188,16 @@ def _setup_experiment(self): else: if any([ self._space, self._parameter_constraints, - self._outcome_constraints, self._mode, self._metric + self._outcome_constraints ]): raise ValueError( "If you create the Ax experiment yourself, do not pass " "values for these parameters to `AxSearch`: {}.".format([ - "space", - "parameter_constraints", - "outcome_constraints", - "mode", - "metric", + "space", "parameter_constraints", "outcome_constraints" ])) exp = self._ax.experiment - - # Update mode and metric from experiment if it has been passed - self._mode = "min" \ - if exp.optimization_config.objective.minimize else "max" - self._metric = exp.optimization_config.objective.metric.name - + self._objective_name = exp.optimization_config.objective.metric.name self._parameters = list(exp.parameters) if self._ax._enforce_sequential_optimization: @@ -260,10 +239,7 @@ def suggest(self, trial_id: str) -> Optional[Dict]: config = self._points_to_evaluate.pop(0) parameters, trial_index = self._ax.attach_trial(config) else: - try: - parameters, trial_index = self._ax.get_next_trial() - except MaxParallelismReachedException: - return None + parameters, trial_index = self._ax.get_next_trial() self._live_trial_mapping[trial_id] = trial_index return unflatten_dict(parameters) @@ -279,12 +255,14 @@ def on_trial_complete(self, trial_id, result=None, 
error=False): def _process_result(self, trial_id, result): ax_trial_index = self._live_trial_mapping[trial_id] - metric_dict = {self._metric: (result[self._metric], None)} + metric_dict = { + self._objective_name: (result[self._objective_name], 0.0) + } outcome_names = [ oc.metric.name for oc in self._ax.experiment.optimization_config.outcome_constraints ] - metric_dict.update({on: (result[on], None) for on in outcome_names}) + metric_dict.update({on: (result[on], 0.0) for on in outcome_names}) self._ax.complete_trial( trial_index=ax_trial_index, raw_data=metric_dict) diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py index b631dc2b15b5..0b752e1be207 100644 --- a/python/ray/tune/tests/test_sample.py +++ b/python/ray/tune/tests/test_sample.py @@ -263,14 +263,12 @@ def testConvertAx(self): ] client1 = AxClient(random_seed=1234) - client1.create_experiment( - parameters=converted_config, objective_name="a", minimize=False) - searcher1 = AxSearch(ax_client=client1) + client1.create_experiment(parameters=converted_config) + searcher1 = AxSearch(ax_client=client1, metric="a", mode="max") client2 = AxClient(random_seed=1234) - client2.create_experiment( - parameters=ax_config, objective_name="a", minimize=False) - searcher2 = AxSearch(ax_client=client2) + client2.create_experiment(parameters=ax_config) + searcher2 = AxSearch(ax_client=client2, metric="a", mode="max") config1 = searcher1.suggest("0") config2 = searcher2.suggest("0") diff --git a/python/ray/tune/tests/test_searchers.py b/python/ray/tune/tests/test_searchers.py index 403b11276dcc..0b50be49db90 100644 --- a/python/ray/tune/tests/test_searchers.py +++ b/python/ray/tune/tests/test_searchers.py @@ -49,10 +49,8 @@ def testAx(self): # At least one nan, inf, -inf and float client = AxClient(random_seed=4321) client.create_experiment( - parameters=converted_config, - objective_name="_metric", - minimize=False) - searcher = AxSearch(ax_client=client) + 
parameters=converted_config, objective_name="_metric") + searcher = AxSearch(ax_client=client, metric="_metric", mode="max") out = tune.run( _invalid_objective, From d6f4fccfc682fefc5c55739473f8bf4043c96d93 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 157/244] Revert "[RLlib] Fix multiple Unity3DEnvs trying to connect to the same custom port (#13519)" This reverts commit 67f411045755aca66486ae83ead77f208b614b69. --- python/requirements_rllib.txt | 4 -- rllib/BUILD | 7 --- rllib/env/wrappers/tests/test_unity3d_env.py | 55 -------------------- rllib/env/wrappers/unity3d_env.py | 21 ++------ 4 files changed, 5 insertions(+), 82 deletions(-) delete mode 100644 rllib/env/wrappers/tests/test_unity3d_env.py diff --git a/python/requirements_rllib.txt b/python/requirements_rllib.txt index 5f5a0f99112d..0cefb02969b3 100644 --- a/python/requirements_rllib.txt +++ b/python/requirements_rllib.txt @@ -16,7 +16,3 @@ kaggle_environments # For MAML on PyTorch. 
higher - -# Unity3D testing -mlagents -mlagents_envs diff --git a/rllib/BUILD b/rllib/BUILD index dd1d4c1638a7..f8f1cbd3c6f8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1069,13 +1069,6 @@ sh_test( data = glob(["examples/serving/*.py"]), ) -py_test( - name = "env/wrappers/tests/test_unity3d_env", - tags = ["env"], - size = "small", - srcs = ["env/wrappers/tests/test_unity3d_env.py"] -) - py_test( name = "env/wrappers/tests/test_recsim_wrapper", tags = ["env"], diff --git a/rllib/env/wrappers/tests/test_unity3d_env.py b/rllib/env/wrappers/tests/test_unity3d_env.py deleted file mode 100644 index 5e347ed0ec05..000000000000 --- a/rllib/env/wrappers/tests/test_unity3d_env.py +++ /dev/null @@ -1,55 +0,0 @@ -import unittest -from unittest.mock import patch - -from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv - - -@patch("mlagents_envs.environment.UnityEnvironment") -class TestUnity3DEnv(unittest.TestCase): - def test_port_editor(self, mock_unity3d): - """Test if the environment uses the editor port - when no environment file is provided""" - - _ = Unity3DEnv(port=None) - args, kwargs = mock_unity3d.call_args - mock_unity3d.assert_called_once() - self.assertEqual(5004, kwargs.get("base_port")) - - def test_port_app(self, mock_unity3d): - """Test if the environment uses the correct port - when the environment file is provided""" - - _ = Unity3DEnv(file_name="app", port=None) - args, kwargs = mock_unity3d.call_args - mock_unity3d.assert_called_once() - self.assertEqual(5005, kwargs.get("base_port")) - - def test_ports_multi_app(self, mock_unity3d): - """Test if the base_port + worker_id - is different for each environment""" - - _ = Unity3DEnv(file_name="app", port=None) - args, kwargs_first = mock_unity3d.call_args - _ = Unity3DEnv(file_name="app", port=None) - args, kwargs_second = mock_unity3d.call_args - self.assertNotEqual( - kwargs_first.get("base_port") + kwargs_first.get("worker_id"), - kwargs_second.get("base_port") + kwargs_second.get("worker_id")) - - 
def test_custom_port_app(self, mock_unity3d): - """Test if the base_port + worker_id is different - for each environment when using custom ports""" - - _ = Unity3DEnv(file_name="app", port=5010) - args, kwargs_first = mock_unity3d.call_args - _ = Unity3DEnv(file_name="app", port=5010) - args, kwargs_second = mock_unity3d.call_args - self.assertNotEqual( - kwargs_first.get("base_port") + kwargs_first.get("worker_id"), - kwargs_second.get("base_port") + kwargs_second.get("worker_id")) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/wrappers/unity3d_env.py b/rllib/env/wrappers/unity3d_env.py index 876c06e96508..753c234439d7 100644 --- a/rllib/env/wrappers/unity3d_env.py +++ b/rllib/env/wrappers/unity3d_env.py @@ -27,12 +27,7 @@ class Unity3DEnv(MultiAgentEnv): inside an RLlib PolicyClient for cloud/distributed training of Unity games. """ - # Default base port when connecting directly to the Editor - _BASE_PORT_EDITOR = 5004 - # Default base port when connecting to a compiled environment - _BASE_PORT_ENVIRONMENT = 5005 - # The worker_id for each environment instance - _WORKER_ID = 0 + _BASE_PORT = 5004 def __init__(self, file_name: str = None, @@ -78,24 +73,18 @@ def __init__(self, # environments (num_workers >> 1). Otherwise, would lead to port # conflicts sometimes. 
time.sleep(random.randint(1, 10)) - port_ = port or (self._BASE_PORT_ENVIRONMENT - if file_name else self._BASE_PORT_EDITOR) - # cache the worker_id and - # increase it for the next environment - worker_id_ = Unity3DEnv._WORKER_ID if file_name else 0 - Unity3DEnv._WORKER_ID += 1 + port_ = port or self._BASE_PORT + self._BASE_PORT += 1 try: self.unity_env = UnityEnvironment( file_name=file_name, - worker_id=worker_id_, + worker_id=0, base_port=port_, seed=seed, no_graphics=no_graphics, timeout_wait=timeout_wait, ) - print( - "Created UnityEnvironment for port {}".format(port_ + - worker_id_)) + print("Created UnityEnvironment for port {}".format(port_)) except mlagents_envs.exception.UnityWorkerInUseException: pass else: From 349b7450ce81983be4510ba8028840a370742c22 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 158/244] Revert "[RLlib] Atari-RAM-Preprocessing, unsigned observation vector results in a false preprocessed observation (#13013)" This reverts commit 44041c9cf44f256fafbabce6c2455b7a74f73ee9. 
--- rllib/models/preprocessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/models/preprocessors.py b/rllib/models/preprocessors.py index 0abfb8658080..44312a807432 100644 --- a/rllib/models/preprocessors.py +++ b/rllib/models/preprocessors.py @@ -140,7 +140,7 @@ def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: @override(Preprocessor) def transform(self, observation: TensorType) -> np.ndarray: self.check_shape(observation) - return (observation.astype("float32") - 128) / 128 + return (observation - 128) / 128 class OneHotPreprocessor(Preprocessor): From 9c985fd43307542d1c5d5f470686664f652f8b01 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 159/244] Revert "[GCS]only update states related fields when publish actor table data (#13448)" This reverts commit abbb0a1b8d5d635103a866a8d29a0a175815a15e. --- .../stats_collector/stats_collector_head.py | 12 +-- .../tests/test_stats_collector.py | 83 ------------------- .../gcs/gcs_client/service_based_accessor.cc | 2 +- .../gcs_client/service_based_gcs_client.cc | 2 +- src/ray/gcs/gcs_server/gcs_actor_manager.cc | 26 +++--- src/ray/gcs/gcs_server/gcs_actor_manager.h | 12 +-- 6 files changed, 16 insertions(+), 121 deletions(-) diff --git a/dashboard/modules/stats_collector/stats_collector_head.py b/dashboard/modules/stats_collector/stats_collector_head.py index d8c085c0ea62..e0b6cffa77b8 100644 --- a/dashboard/modules/stats_collector/stats_collector_head.py +++ b/dashboard/modules/stats_collector/stats_collector_head.py @@ -221,25 +221,15 @@ def _process_actor_table_data(data): RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS) # Receive actors from channel. 
- state_keys = ("state", "address", "numRestarts", "timestamp", "pid") async for sender, msg in receiver.iter(): try: - actor_id, actor_table_data = msg + _, actor_table_data = msg pubsub_message = ray.gcs_utils.PubSubMessage.FromString( actor_table_data) message = ray.gcs_utils.ActorTableData.FromString( pubsub_message.data) actor_table_data = actor_table_data_to_dict(message) _process_actor_table_data(actor_table_data) - # If actor is not new registered but updated, we only update - # states related fields. - if actor_table_data["state"] != "DEPENDENCIES_UNREADY": - actor_id = actor_id.decode("UTF-8")[len( - ray.gcs_utils.TablePrefix_ACTOR_string + ":"):] - actor_table_data_copy = dict(DataSource.actors[actor_id]) - for k in state_keys: - actor_table_data_copy[k] = actor_table_data[k] - actor_table_data = actor_table_data_copy actor_id = actor_table_data["actorId"] job_id = actor_table_data["jobId"] node_id = actor_table_data["address"]["rayletId"] diff --git a/dashboard/modules/stats_collector/tests/test_stats_collector.py b/dashboard/modules/stats_collector/tests/test_stats_collector.py index cb4a1d3c5470..fcd1c42e3456 100644 --- a/dashboard/modules/stats_collector/tests/test_stats_collector.py +++ b/dashboard/modules/stats_collector/tests/test_stats_collector.py @@ -7,12 +7,9 @@ import random import pytest import ray -import redis import threading import ray.new_dashboard.modules.stats_collector.stats_collector_consts \ as stats_collector_consts -import ray.new_dashboard.utils as dashboard_utils -import ray.ray_constants as ray_constants from datetime import datetime, timedelta from ray.cluster_utils import Cluster from ray.new_dashboard.tests.conftest import * # noqa @@ -420,85 +417,5 @@ class InfeasibleActor: raise Exception(f"Timed out while testing, {ex_stack}") -def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard): - timeout = 5 - assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) - is True) - address_info = 
ray_start_with_dashboard - address = address_info["redis_address"] - address = address.split(":") - assert len(address) == 2 - - client = redis.StrictRedis( - host=address[0], - port=int(address[1]), - password=ray_constants.REDIS_DEFAULT_PASSWORD) - - p = client.pubsub(ignore_subscribe_messages=True) - p.psubscribe(ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN) - - @ray.remote - class DummyActor: - def __init__(self): - pass - - # Create a dummy actor. - a = DummyActor.remote() - - def handle_pub_messages(client, msgs, timeout, expect_num): - start_time = time.time() - while time.time() - start_time < timeout and len(msgs) < expect_num: - msg = client.get_message() - if msg is None: - time.sleep(0.01) - continue - pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"]) - actor_data = ray.gcs_utils.ActorTableData.FromString( - pubsub_msg.data) - msgs.append(actor_data) - - msgs = [] - handle_pub_messages(p, msgs, timeout, 2) - - # Assert we received published actor messages with state - # DEPENDENCIES_UNREADY and ALIVE. - assert len(msgs) == 2 - - # Kill actor. - ray.kill(a) - handle_pub_messages(p, msgs, timeout, 3) - - # Assert we received published actor messages with state DEAD. - assert len(msgs) == 3 - - def actor_table_data_to_dict(message): - return dashboard_utils.message_to_dict( - message, { - "actorId", "parentId", "jobId", "workerId", "rayletId", - "actorCreationDummyObjectId", "callerId", "taskId", - "parentTaskId", "sourceActorId", "placementGroupId" - }, - including_default_value_fields=False) - - non_state_keys = ("actorId", "jobId", "taskSpec") - for msg in msgs: - actor_data_dict = actor_table_data_to_dict(msg) - # DEPENDENCIES_UNREADY is 0, which would not be keeped in dict. We - # need check its original value. - if msg.state == 0: - assert len(actor_data_dict) > 5 - for k in non_state_keys: - assert k in actor_data_dict - # For status that is not DEPENDENCIES_UNREADY, only states fields will - # be published. 
- elif actor_data_dict["state"] in ("ALIVE", "DEAD"): - assert actor_data_dict.keys() == { - "state", "address", "timestamp", "pid" - } - else: - raise Exception("Unknown state: {}".format( - actor_data_dict["state"])) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index 891bd6ba6a54..821e0f7d930a 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -275,7 +275,7 @@ Status ServiceBasedActorInfoAccessor::AsyncSubscribe( auto on_subscribe = [subscribe](const std::string &id, const std::string &data) { ActorTableData actor_data; actor_data.ParseFromString(data); - subscribe(ActorID::FromHex(id), actor_data); + subscribe(ActorID::FromBinary(actor_data.actor_id()), actor_data); }; return client_impl_->GetGcsPubSub().Subscribe(ACTOR_CHANNEL, actor_id.Hex(), on_subscribe, subscribe_done); diff --git a/src/ray/gcs/gcs_client/service_based_gcs_client.cc b/src/ray/gcs/gcs_client/service_based_gcs_client.cc index 5fccd645726d..cf9bdd9e4d4e 100644 --- a/src/ray/gcs/gcs_client/service_based_gcs_client.cc +++ b/src/ray/gcs/gcs_client/service_based_gcs_client.cc @@ -207,7 +207,7 @@ void ServiceBasedGcsClient::ReconnectGcsServer() { RAY_LOG(INFO) << "Repeated reconnection in " << RayConfig::instance().minimum_gcs_reconnect_interval_milliseconds() - << " milliseconds, return directly."; + << "milliseconds, return directly."; return; } diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 2f3740654c8b..7b30bbc7dde9 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -503,9 +503,9 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor->GetActorID(), *actor_table_data, [this, actor_id, 
actor_table_data](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish( - ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(*actor_table_data)->SerializeAsString(), nullptr)); + RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), + actor_table_data->SerializeAsString(), + nullptr)); // Destroy placement group owned by this actor. destroy_owned_placement_group_if_needed_(actor_id); })); @@ -677,6 +677,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche // between memory cache and storage. mutable_actor_table_data->set_num_restarts(num_restarts + 1); mutable_actor_table_data->set_state(rpc::ActorTableData::RESTARTING); + const auto actor_table_data = actor->GetActorTableData(); // Make sure to reset the address before flushing to GCS. Otherwise, // GCS will mistakenly consider this lease request succeeds when restarting. actor->UpdateAddress(rpc::Address()); @@ -684,11 +685,10 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche // The backend storage is reliable in the future, so the status must be ok. RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, - [this, actor_id, mutable_actor_table_data](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish( - ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(*mutable_actor_table_data)->SerializeAsString(), - nullptr)); + [this, actor_id, actor_table_data](Status status) { + RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), + actor_table_data.SerializeAsString(), + nullptr)); })); gcs_actor_scheduler_->Schedule(actor); } else { @@ -701,7 +701,6 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche } mutable_actor_table_data->set_state(rpc::ActorTableData::DEAD); - mutable_actor_table_data->set_timestamp(current_sys_time_ms()); // The backend storage is reliable in the future, so the status must be ok. 
RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, @@ -714,8 +713,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche } RAY_CHECK_OK(gcs_pub_sub_->Publish( ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(*mutable_actor_table_data)->SerializeAsString(), - nullptr)); + mutable_actor_table_data->SerializeAsString(), nullptr)); })); // The actor is dead, but we should not remove the entry from the // registered actors yet. If the actor is owned, we will destroy the actor @@ -756,9 +754,9 @@ void GcsActorManager::OnActorCreationSuccess(const std::shared_ptr &ac RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, actor_table_data, [this, actor_id, actor_table_data, actor](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish( - ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(actor_table_data)->SerializeAsString(), nullptr)); + RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), + actor_table_data.SerializeAsString(), + nullptr)); // Invoke all callbacks for all registration requests of this actor (duplicated // requests are included) and remove all of them from // actor_to_create_callbacks_. diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index d3ffc309793e..0f47cfb4f672 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -316,6 +316,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { absl::flat_hash_set GetUnresolvedActorsByOwnerWorker( const NodeID &node_id, const WorkerID &worker_id) const; + private: /// Reconstruct the specified actor. /// /// \param actor The target actor to be reconstructed. @@ -345,17 +346,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// \param actor The actor to be killed. 
void AddDestroyedActorToCache(const std::shared_ptr &actor); - std::shared_ptr GenActorDataOnlyWithStates( - const rpc::ActorTableData &actor) { - auto actor_delta = std::make_shared(); - actor_delta->set_state(actor.state()); - actor_delta->mutable_address()->CopyFrom(actor.address()); - actor_delta->set_num_restarts(actor.num_restarts()); - actor_delta->set_timestamp(actor.timestamp()); - actor_delta->set_pid(actor.pid()); - return actor_delta; - } - /// Callbacks of pending `RegisterActor` requests. /// Maps actor ID to actor registration callbacks, which is used to filter duplicated /// messages from a driver/worker caused by some network problems. From 834e382069136dc935509245a598fbb66d906bfd Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 160/244] Revert "[Serve] Add "endpoint registered" message to router log (#13752)" This reverts commit c732b42ff189419680292b920f24ae87cb2de5fb. --- python/ray/serve/router.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/serve/router.py b/python/ray/serve/router.py index c4a87b49bb60..477f037fd459 100644 --- a/python/ray/serve/router.py +++ b/python/ray/serve/router.py @@ -256,7 +256,6 @@ async def assign_request( raise RayServeException( f"Endpoint {endpoint} was removed. This request " "cannot be completed.") - logger.info(f"Endpoint {endpoint} registered.") endpoint_policy = self.endpoint_policies[endpoint] chosen_backend, *shadow_backends = endpoint_policy.assign(query) From d14af72fef44d6debbdab961c807c4f81a319d3d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 161/244] Revert "[Core] Hotfix Windows Compilation Error for ClusterTaskManager (#13754)" This reverts commit 8d72380ebc20c7aa854e4a1df03157aa96304f9d. 
--- src/ray/raylet/scheduling/cluster_task_manager.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index a4dbff1f48dd..43c6ce1cc78a 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -618,15 +618,15 @@ bool ClusterTaskManager::AnyPendingTasks(Task *exemplar, bool *any_pending, std::string ClusterTaskManager::DebugStr() const { // TODO(Shanly): This method will be replaced with `DebugString` once we remove the // legacy scheduler. - auto accumulator = [](size_t state, const std::pair> &pair) { + auto accumulator = [](int state, const std::pair> &pair) { return state + pair.second.size(); }; - size_t num_infeasible_tasks = std::accumulate( - infeasible_tasks_.begin(), infeasible_tasks_.end(), (size_t)0, accumulator); - size_t num_tasks_to_schedule = std::accumulate( - tasks_to_schedule_.begin(), tasks_to_schedule_.end(), (size_t)0, accumulator); - size_t num_tasks_to_dispatch = std::accumulate( - tasks_to_dispatch_.begin(), tasks_to_dispatch_.end(), (size_t)0, accumulator); + int num_infeasible_tasks = + std::accumulate(infeasible_tasks_.begin(), infeasible_tasks_.end(), 0, accumulator); + int num_tasks_to_schedule = std::accumulate(tasks_to_schedule_.begin(), + tasks_to_schedule_.end(), 0, accumulator); + int num_tasks_to_dispatch = std::accumulate(tasks_to_dispatch_.begin(), + tasks_to_dispatch_.end(), 0, accumulator); std::stringstream buffer; buffer << "========== Node: " << self_node_id_ << " =================\n"; buffer << "Infeasible queue length: " << num_infeasible_tasks << "\n"; From d8ee9b5fc31294666e2fc4680de1ee6a05184de2 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 162/244] Revert "Revert "[Serve] Fix ServeHandle serialization (#13695)" (#13753)" This 
reverts commit 405c8d77945238f10641e428f6121acca8757536. --- python/ray/serve/api.py | 7 +++++ python/ray/serve/handle.py | 25 ++++++++++----- python/ray/serve/tests/test_handle.py | 44 ++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index b42cd78464a7..19783dc3700b 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -66,6 +66,8 @@ def check(self, *args, **kwargs): class ThreadProxiedRouter: def __init__(self, controller_handle, sync: bool): + self.controller_handle = controller_handle + self.sync = sync self.router = Router(controller_handle) if sync: @@ -92,6 +94,11 @@ def _remote(self, endpoint_name, handle_options, request_data, **kwargs) return coro + def __reduce__(self): + deserializer = ThreadProxiedRouter + serialized_data = (self.controller_handle, self.sync) + return deserializer, serialized_data + class Client: def __init__(self, diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index c6951c6380b9..4ee2624a8d31 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -4,8 +4,6 @@ from typing import Any, Dict, Optional, Union from enum import Enum -from ray.serve.router import Router - @dataclass(frozen=True) class HandleOptions: @@ -40,10 +38,11 @@ class RayServeHandle: # raises RayTaskError Exception """ - def __init__(self, - router: Router, - endpoint_name, - handle_options: Optional[HandleOptions] = None): + def __init__( + self, + router, # ThreadProxiedRouter + endpoint_name, + handle_options: Optional[HandleOptions] = None): self.router = router self.endpoint_name = endpoint_name self.handle_options = handle_options or HandleOptions() @@ -78,7 +77,7 @@ def options(self, async def remote(self, request_data: Optional[Union[Dict, Any]] = None, **kwargs): - """Issue an asynchrounous request to the endpoint. + """Issue an asynchronous request to the endpoint. 
Returns a Ray ObjectRef whose results can be waited for or retrieved using ray.wait or ray.get (or ``await object_ref``), respectively. @@ -98,6 +97,12 @@ async def remote(self, def __repr__(self): return f"{self.__class__.__name__}(endpoint='{self.endpoint_name}')" + def __reduce__(self): + deserializer = RayServeHandle + serialized_data = (self.router, self.endpoint_name, + self.handle_options) + return deserializer, serialized_data + class RayServeSyncHandle(RayServeHandle): def remote(self, request_data: Optional[Union[Dict, Any]] = None, @@ -123,3 +128,9 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( coro, self.router.async_loop) return future.result() + + def __reduce__(self): + deserializer = RayServeSyncHandle + serialized_data = (self.router, self.endpoint_name, + self.handle_options) + return deserializer, serialized_data diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index c17db7686aad..88ab9d2c2b7a 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -1,9 +1,51 @@ import requests - +import pytest import ray from ray import serve +@pytest.mark.asyncio +async def test_async_handle_serializable(serve_instance): + client = serve_instance + + def f(_): + return "hello" + + client.create_backend("f", f) + client.create_endpoint("f", backend="f") + + @ray.remote + class TaskActor: + async def task(self, handle): + ref = await handle.remote() + output = await ref + return output + + handle = client.get_handle("f", sync=False) + + task_actor = TaskActor.remote() + result = await task_actor.task.remote(handle) + assert result == "hello" + + +def test_sync_handle_serializable(serve_instance): + client = serve_instance + + def f(_): + return "hello" + + client.create_backend("f", f) + client.create_endpoint("f", backend="f") + + @ray.remote + def task(handle): + return 
ray.get(handle.remote()) + + handle = client.get_handle("f", sync=True) + result_ref = task.remote(handle) + assert ray.get(result_ref) == "hello" + + def test_handle_in_endpoint(serve_instance): client = serve_instance From 11cb3a618b2f9177bf66ba3c53dcb6ed4ea57200 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 163/244] Revert " Report failed deserialization of errors in Ray client" This reverts commit c1f9c0d44ebd9d65302c58b9bbeb8df6ae1c5c8e. --- python/ray/util/client/worker.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index b0a4b78f52b1..9f2f189c6ae2 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -172,11 +172,7 @@ def _get(self, ref: ClientObjectRef, timeout: float): except grpc.RpcError as e: raise e.details() if not data.valid: - try: - err = cloudpickle.loads(data.error) - except Exception: - logger.exception("Failed to deserialize {}".format(data.error)) - raise + err = cloudpickle.loads(data.error) logger.error(err) raise err return loads_from_server(data.data) @@ -260,12 +256,7 @@ def _call_schedule_for_task( except grpc.RpcError as e: raise decode_exception(e.details) if not ticket.valid: - try: - raise cloudpickle.loads(ticket.error) - except Exception: - logger.exception("Failed to deserialize {}".format( - ticket.error)) - raise + raise cloudpickle.loads(ticket.error) return ticket.return_ids def call_release(self, id: bytes) -> None: From 1ba588626dc168322a17d490419deb81e086fee7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 164/244] Revert "[docs] Fix MLflow / Tune example in documentation (#13740)" This reverts commit e215ffe2b515b70fbf8081a2643ad3886236d4fb. 
--- python/ray/tune/integration/mlflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/integration/mlflow.py b/python/ray/tune/integration/mlflow.py index 6e038b810f78..cbd3811d4e30 100644 --- a/python/ray/tune/integration/mlflow.py +++ b/python/ray/tune/integration/mlflow.py @@ -274,8 +274,8 @@ def train_fn(config): @mlflow_mixin def train_fn(config): for i in range(10): - loss = config["a"] + config["b"] - mlflow.log_metric(key="loss", value=loss) + loss = self.config["a"] + self.config["b"] + mlflow.log_metric(key="loss", value=loss}) tune.report(loss=loss, done=True) tune.run( From 65cc74f57387969c72f6957f42c613d45ed24aae Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 165/244] Revert "[autoscaler][AWS] Make sure subnets belong to same VPC as user-specified security groups (#13558)" This reverts commit 748813a26c1db18bf94709872366eaada6224361. --- python/ray/autoscaler/_private/aws/config.py | 56 +------------------ ...xample-head-and-worker-security-group.yaml | 31 ---------- python/ray/tests/aws/test_autoscaler_aws.py | 20 ------- python/ray/tests/aws/utils/constants.py | 13 ----- python/ray/tests/aws/utils/stubs.py | 21 +------ 5 files changed, 4 insertions(+), 137 deletions(-) delete mode 100644 python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml diff --git a/python/ray/autoscaler/_private/aws/config.py b/python/ray/autoscaler/_private/aws/config.py index 4c3a1c448102..79fc57896dac 100644 --- a/python/ray/autoscaler/_private/aws/config.py +++ b/python/ray/autoscaler/_private/aws/config.py @@ -5,7 +5,6 @@ import json import os import time -from typing import Any, Dict, List import logging import boto3 @@ -358,23 +357,9 @@ def _configure_subnet(config): ec2 = _resource("ec2", config) use_internal_ips = config["provider"].get("use_internal_ips", False) - # If head or worker security group is specified, filter down 
to subnets - # belonging to the same VPC as the security group. - sg_ids = (config["head_node"].get("SecurityGroupIds", []) + - config["worker_nodes"].get("SecurityGroupIds", [])) - if sg_ids: - vpc_id_of_sg = _get_vpc_id_of_sg(sg_ids, config) - else: - vpc_id_of_sg = None - try: - candidate_subnets = ec2.subnets.all() - if vpc_id_of_sg: - candidate_subnets = [ - s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg - ] subnets = sorted( - (s for s in candidate_subnets if s.state == "available" and ( + (s for s in ec2.subnets.all() if s.state == "available" and ( use_internal_ips or s.map_public_ip_on_launch)), reverse=True, # sort from Z-A key=lambda subnet: subnet.availability_zone) @@ -429,34 +414,6 @@ def _configure_subnet(config): return config -def _get_vpc_id_of_sg(sg_ids: List[str], config: Dict[str, Any]) -> str: - """Returns the VPC id of the security groups with the provided security - group ids. - - Errors if the provided security groups belong to multiple VPCs. - Errors if no security group with any of the provided ids is identified. - """ - sg_ids = list(set(sg_ids)) - - ec2 = _resource("ec2", config) - filters = [{"Name": "group-id", "Values": sg_ids}] - security_groups = ec2.security_groups.filter(Filters=filters) - vpc_ids = [sg.vpc_id for sg in security_groups] - vpc_ids = list(set(vpc_ids)) - - multiple_vpc_msg = "All security groups specified in the cluster config "\ - "should belong to the same VPC." - cli_logger.doassert(len(vpc_ids) <= 1, multiple_vpc_msg) - assert len(vpc_ids) <= 1, multiple_vpc_msg - - no_sg_msg = "Failed to detect a security group with id equal to any of "\ - "the configured SecurityGroupIds." 
- cli_logger.doassert(len(vpc_ids) > 0, no_sg_msg) - assert len(vpc_ids) > 0, no_sg_msg - - return vpc_ids[0] - - def _configure_security_group(config): _set_config_info( head_security_group_src="config", workers_security_group_src="config") @@ -609,13 +566,6 @@ def _create_security_group(config, vpc_id, group_name): def _upsert_security_group_rules(conf, security_groups): sgids = {sg.id for sg in security_groups.values()} - - # Update sgids to include user-specified security groups. - # This is necessary if the user specifies the head node type's security - # groups but not the worker's, or vice-versa. - for node_type in NODE_KIND_CONFIG_KEYS.values(): - sgids.update(conf[node_type].get("SecurityGroupIds", [])) - # sort security group items for deterministic inbound rule config order # (mainly supports more precise stub-based boto3 unit testing) for node_type, sg in sorted(security_groups.items()): @@ -633,7 +583,7 @@ def _update_inbound_rules(target_security_group, sgids, config): def _create_default_inbound_rules(sgids, extended_rules=[]): - intracluster_rules = _create_default_intracluster_inbound_rules(sgids) + intracluster_rules = _create_default_instracluster_inbound_rules(sgids) ssh_rules = _create_default_ssh_inbound_rules() merged_rules = itertools.chain( intracluster_rules, @@ -643,7 +593,7 @@ def _create_default_inbound_rules(sgids, extended_rules=[]): return list(merged_rules) -def _create_default_intracluster_inbound_rules(intracluster_sgids): +def _create_default_instracluster_inbound_rules(intracluster_sgids): return [{ "FromPort": -1, "ToPort": -1, diff --git a/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml b/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml deleted file mode 100644 index b940366a0e2f..000000000000 --- a/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml +++ /dev/null @@ -1,31 +0,0 @@ -cluster_name: sg - -max_workers: 1 - -provider: - type: aws - region: us-west-2 - 
availability_zone: us-west-2a - -auth: - ssh_user: ubuntu - -# If required, head and worker nodes can exist on subnets in different VPCs and -# communicate via VPC peering. - -# VPC peering overview: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-peering.html. -# Setup VPC peering: https://docs.aws.amazon.com/vpc/latest/peering/create-vpc-peering-connection.html. -# Configure VPC peering route tables: https://docs.aws.amazon.com/vpc/latest/peering/vpc-peering-routing.html. - -# To enable external SSH connectivity, you should also ensure that your VPC -# is configured to assign public IPv4 addresses to every EC2 instance -# assigned to it. -head_node: - SecurityGroupIds: - - sg-1234abcd # Replace with an actual security group id. - -worker_nodes: - SecurityGroupIds: - - sg-1234abcd # Replace with an actual security group id. - - diff --git a/python/ray/tests/aws/test_autoscaler_aws.py b/python/ray/tests/aws/test_autoscaler_aws.py index 52ceb9fb8ecd..697c9efb163c 100644 --- a/python/ray/tests/aws/test_autoscaler_aws.py +++ b/python/ray/tests/aws/test_autoscaler_aws.py @@ -113,26 +113,6 @@ def test_create_sg_with_custom_inbound_rules_and_name(iam_client_stub, ec2_client_stub.assert_no_pending_responses() -def test_subnet_given_head_and_worker_sg(iam_client_stub, ec2_client_stub): - stubs.configure_iam_role_default(iam_client_stub) - stubs.configure_key_pair_default(ec2_client_stub) - - # list a security group and a thousand subnets in different vpcs - stubs.describe_a_security_group(ec2_client_stub, DEFAULT_SG) - stubs.describe_a_thousand_subnets_in_different_vpcs(ec2_client_stub) - - config = helpers.bootstrap_aws_example_config_file( - "example-head-and-worker-security-group.yaml") - - # check that just the single subnet in the right vpc is filled - assert config["head_node"]["SubnetIds"] == [DEFAULT_SUBNET["SubnetId"]] - assert config["worker_nodes"]["SubnetIds"] == [DEFAULT_SUBNET["SubnetId"]] - - # expect no pending responses left in IAM or EC2 client stub 
queues - iam_client_stub.assert_no_pending_responses() - ec2_client_stub.assert_no_pending_responses() - - if __name__ == "__main__": import sys sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/aws/utils/constants.py b/python/ray/tests/aws/utils/constants.py index adc8a5b2abe4..cdcf5a79c68d 100644 --- a/python/ray/tests/aws/utils/constants.py +++ b/python/ray/tests/aws/utils/constants.py @@ -50,19 +50,6 @@ "VpcId": "vpc-0000000", } - -def subnet_in_vpc(vpc_num): - """Returns a copy of DEFAULT_SUBNET whose VpcId ends with the digits - of vpc_num.""" - subnet = copy.copy(DEFAULT_SUBNET) - subnet["VpcId"] = f"vpc-{vpc_num:07d}" - return subnet - - -A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS = [ - subnet_in_vpc(vpc_num) for vpc_num in range(1, 1000) -] + [DEFAULT_SUBNET] - # Secondary EC2 subnet to expose to tests as required. AUX_SUBNET = { "AvailabilityZone": "us-west-2a", diff --git a/python/ray/tests/aws/utils/stubs.py b/python/ray/tests/aws/utils/stubs.py index 61f1f9ab632b..7840447d80e0 100644 --- a/python/ray/tests/aws/utils/stubs.py +++ b/python/ray/tests/aws/utils/stubs.py @@ -1,7 +1,7 @@ import ray from ray.tests.aws.utils.mocks import mock_path_exists_key_pair from ray.tests.aws.utils.constants import DEFAULT_INSTANCE_PROFILE, \ - DEFAULT_KEY_PAIR, DEFAULT_SUBNET, A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS + DEFAULT_KEY_PAIR, DEFAULT_SUBNET from unittest import mock @@ -41,13 +41,6 @@ def configure_subnet_default(ec2_client_stub): service_response={"Subnets": [DEFAULT_SUBNET]}) -def describe_a_thousand_subnets_in_different_vpcs(ec2_client_stub): - ec2_client_stub.add_response( - "describe_subnets", - expected_params={}, - service_response={"Subnets": A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS}) - - def skip_to_configure_sg(ec2_client_stub, iam_client_stub): configure_iam_role_default(iam_client_stub) configure_key_pair_default(ec2_client_stub) @@ -73,18 +66,6 @@ def describe_no_security_groups(ec2_client_stub): service_response={}) -def 
describe_a_security_group(ec2_client_stub, security_group): - ec2_client_stub.add_response( - "describe_security_groups", - expected_params={ - "Filters": [{ - "Name": "group-id", - "Values": [security_group["GroupId"]] - }] - }, - service_response={"SecurityGroups": [security_group]}) - - def create_sg_echo(ec2_client_stub, security_group): ec2_client_stub.add_response( "create_security_group", From 14041927c4258e197a3389f9d9a72bc85fe92d53 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 166/244] Revert "[docs] change MLFlow to MLflow in docs (#13739)" This reverts commit d29f46b608ef490d2887de6dca341aedfb972fb2. --- doc/source/tune/_tutorials/overview.rst | 4 ++-- doc/source/tune/api_docs/logging.rst | 2 +- doc/source/tune/examples/index.rst | 6 +++--- doc/source/tune/index.rst | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/tune/_tutorials/overview.rst b/doc/source/tune/_tutorials/overview.rst index 8e79b8ca158a..0517c2f0a9e5 100644 --- a/doc/source/tune/_tutorials/overview.rst +++ b/doc/source/tune/_tutorials/overview.rst @@ -71,9 +71,9 @@ Take a look at any of the below tutorials to get started with Tune. :description: :doc:`Track your experiment process with the Weights & Biases tools ` .. customgalleryitem:: - :tooltip: Use MLflow with Ray Tune. + :tooltip: Use MLFlow with Ray Tune. :figure: /images/mlflow.png - :description: :doc:`Log and track your hyperparameter sweep with MLflow Tracking & AutoLogging ` + :description: :doc:`Log and track your hyperparameter sweep with MLFlow Tracking & AutoLogging ` .. raw:: html diff --git a/doc/source/tune/api_docs/logging.rst b/doc/source/tune/api_docs/logging.rst index 1bdc400cc802..b976a898ed08 100644 --- a/doc/source/tune/api_docs/logging.rst +++ b/doc/source/tune/api_docs/logging.rst @@ -162,7 +162,7 @@ CSVLogger MLFlowLogger ------------ -Tune also provides a default logger for `MLflow `_. 
You can install MLflow via ``pip install mlflow``. +Tune also provides a default logger for `MLFlow `_. You can install MLFlow via ``pip install mlflow``. You can see the :doc:`tutorial here `. WandbLogger diff --git a/doc/source/tune/examples/index.rst b/doc/source/tune/examples/index.rst index acdb758929ea..27fde3a05711 100644 --- a/doc/source/tune/examples/index.rst +++ b/doc/source/tune/examples/index.rst @@ -82,13 +82,13 @@ Pytorch Lightning - :doc:`/tune/examples/mnist_pytorch_lightning`: A comprehensive example using `Pytorch Lightning `_ to train a MNIST model. This example showcases how to use various search optimization techniques. It utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks `. - :ref:`A walkthrough tutorial for using Ray Tune with Pytorch-Lightning `. -Wandb, MLflow +Wandb, MLFlow ~~~~~~~~~~~~~ - :ref:`Tutorial ` for using `wandb `__ with Ray Tune - :doc:`/tune/examples/wandb_example`: Example for using `Weights and Biases `__ with Ray Tune. -- :doc:`/tune/examples/mlflow_example`: Example for using `MLflow `__ with Ray Tune. -- :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLflow `__ and `Pytorch Lightning `_ with Ray Tune. +- :doc:`/tune/examples/mlflow_example`: Example for using `MLFlow `__ with Ray Tune. +- :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLFlow `__ and `Pytorch Lightning `_ with Ray Tune. Tensorflow/Keras ~~~~~~~~~~~~~~~~ diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst index 2003b2eacb80..86f312cf8ddd 100644 --- a/doc/source/tune/index.rst +++ b/doc/source/tune/index.rst @@ -73,7 +73,7 @@ A key problem with machine learning frameworks is the need to restructure all of With Tune, you can optimize your model just by :ref:`adding a few code snippets `. -Further, Tune actually removes boilerplate from your code training workflow, automatically :ref:`managing checkpoints ` and :ref:`logging results to tools ` such as MLflow and TensorBoard. 
+Further, Tune actually removes boilerplate from your code training workflow, automatically :ref:`managing checkpoints ` and :ref:`logging results to tools ` such as MLFlow and TensorBoard. Multi-GPU & distributed training out of the box From 1d0ea62ebef5c1ece93a0fef3303f313b23c7199 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 167/244] Revert "[Core] Add private on_completed callback for ObjectRef (#13688)" This reverts commit 04ed213e9bff418179b8c8d4ff882ba6427b771a. --- python/ray/_raylet.pyx | 42 +++++++++++++++++++++--------- python/ray/includes/object_ref.pxi | 42 ++++-------------------------- python/ray/tests/test_asyncio.py | 22 +--------------- 3 files changed, 36 insertions(+), 70 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index dc9fceaca7df..0fc3f4bf25da 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -1569,13 +1569,12 @@ cdef class CoreWorker: return ref_counts - def set_get_async_callback(self, ObjectRef object_ref, callback): - cpython.Py_INCREF(callback) + def get_async(self, ObjectRef object_ref, future): + cpython.Py_INCREF(future) CCoreWorkerProcess.GetCoreWorker().GetAsync( - object_ref.native(), - async_callback, - callback - ) + object_ref.native(), + async_set_result, + future) def push_error(self, JobID job_id, error_type, error_message, double timestamp): @@ -1589,11 +1588,13 @@ cdef class CoreWorker: resource_name.encode("ascii"), capacity, CNodeID.FromBinary(client_id.binary())) -cdef void async_callback(shared_ptr[CRayObject] obj, - CObjectID object_ref, - void *user_callback) with gil: +cdef void async_set_result(shared_ptr[CRayObject] obj, + CObjectID object_ref, + void *future) with gil: cdef: c_vector[shared_ptr[CRayObject]] objects_to_deserialize + py_future = (future) + loop = py_future._loop # Object is retrieved from in memory store. 
# Here we go through the code path used to deserialize objects. @@ -1604,6 +1605,23 @@ cdef void async_callback(shared_ptr[CRayObject] obj, result = ray.worker.global_worker.deserialize_objects( data_metadata_pairs, ids_to_deserialize)[0] - py_callback = user_callback - py_callback(result) - cpython.Py_DECREF(py_callback) + def set_future(): + # Issue #11030, #8841 + # If this future has result set already, we just need to + # skip the set result/exception procedure. + if py_future.done(): + cpython.Py_DECREF(py_future) + return + + if isinstance(result, RayTaskError): + ray.worker.last_task_error_raise_time = time.time() + py_future.set_exception(result.as_instanceof_cause()) + elif isinstance(result, RayError): + # Directly raise exception for RayActorError + py_future.set_exception(result) + else: + py_future.set_result(result) + + cpython.Py_DECREF(py_future) + + loop.call_soon_threadsafe(set_future) diff --git a/python/ray/includes/object_ref.pxi b/python/ray/includes/object_ref.pxi index 31c59d08ba2c..3353e696edbf 100644 --- a/python/ray/includes/object_ref.pxi +++ b/python/ray/includes/object_ref.pxi @@ -1,7 +1,6 @@ from ray.includes.unique_ids cimport CObjectID import asyncio -from typing import Callable, Any import ray @@ -72,41 +71,10 @@ cdef class ObjectRef(BaseID): def as_future(self): loop = asyncio.get_event_loop() - py_future = loop.create_future() - - def callback(result): - loop = py_future._loop - - def set_future(): - # Issue #11030, #8841 - # If this future has result set already, we just need to - # skip the set result/exception procedure. 
- if py_future.done(): - return - - if isinstance(result, RayTaskError): - ray.worker.last_task_error_raise_time = time.time() - py_future.set_exception(result.as_instanceof_cause()) - elif isinstance(result, RayError): - # Directly raise exception for RayActorError - py_future.set_exception(result) - else: - py_future.set_result(result) - - loop.call_soon_threadsafe(set_future) - - self._on_completed(callback) + core_worker = ray.worker.global_worker.core_worker + future = loop.create_future() + core_worker.get_async(self, future) # A hack to keep a reference to the object ref for ref counting. - py_future.object_ref = self - return py_future - - def _on_completed(self, py_callback: Callable[[Any], None]): - """Register a callback that will be called after Object is ready. - If the ObjectRef is already ready, the callback will be called soon. - The callback should take the result as the only argument. The result - can be an exception object in case of task error. - """ - core_worker = ray.worker.global_worker.core_worker - core_worker.set_get_async_callback(self, py_callback) - return self + future.object_ref = self + return future diff --git a/python/ray/tests/test_asyncio.py b/python/ray/tests/test_asyncio.py index 31f03aefa546..18dd63a22d07 100644 --- a/python/ray/tests/test_asyncio.py +++ b/python/ray/tests/test_asyncio.py @@ -6,7 +6,7 @@ import pytest import ray -from ray.test_utils import SignalActor, wait_for_condition +from ray.test_utils import SignalActor def test_asyncio_actor(ray_start_regular_shared): @@ -224,26 +224,6 @@ async def loop_forever(self): ray.get(a.ping.remote()) -def test_async_callback(ray_start_regular_shared): - global_set = set() - - ref = ray.put(None) - ref._on_completed(lambda _: global_set.add("completed-1")) - wait_for_condition(lambda: "completed-1" in global_set) - - signal = SignalActor.remote() - - @ray.remote - def wait(): - ray.get(signal.wait.remote()) - - ref = wait.remote() - ref._on_completed(lambda _: 
global_set.add("completed-2")) - assert "completed-2" not in global_set - signal.send.remote() - wait_for_condition(lambda: "completed-2" in global_set) - - if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) From be3abce9beb86c86f29e1e8818de78f63c4ab118 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 168/244] Revert "[Object Spilling] Remove job id from the io worker log name. (#13746)" This reverts commit e63c6de5ae23927ed0f637b44bbba75b215b7cc1. --- python/ray/ray_logging.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/ray/ray_logging.py b/python/ray/ray_logging.py index c9af57536b0c..56df7b5c2092 100644 --- a/python/ray/ray_logging.py +++ b/python/ray/ray_logging.py @@ -165,17 +165,15 @@ def get_worker_log_file_name(worker_type): "please report it to Ray's Github issue.") worker_name = "worker" else: - job_id = "" + job_id = ray.JobID.nil() worker_name = "io_worker" # Make sure these values are set already. assert ray.worker._global_node is not None assert ray.worker.global_worker is not None filename = (f"{worker_name}-" - f"{binary_to_hex(ray.worker.global_worker.worker_id)}-") - if job_id: - filename += f"{job_id}-" - filename += f"{os.getpid()}" + f"{binary_to_hex(ray.worker.global_worker.worker_id)}-" + f"{job_id}-{os.getpid()}") return filename From 99d7fc8bc0720db3acec4753d1857045b7f0c2c5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 169/244] Revert "Revert "Revert "[CLI] Fix Ray Status with ENV Variable set (#13707) (#13726)" This reverts commit fd14fc289435d97982036b5be83ecfe251035659. 
--- python/ray/_private/services.py | 2 +- python/ray/tests/test_cli.py | 28 ------------------- .../test_cli_patterns/test_ray_status.txt | 14 ---------- 3 files changed, 1 insertion(+), 43 deletions(-) delete mode 100644 python/ray/tests/test_cli_patterns/test_ray_status.txt diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 1c4c6497dca6..d0eafc9693c6 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -216,7 +216,7 @@ def get_ray_address_to_use_or_die(): A string to pass into `ray.init(address=...)` """ if "RAY_ADDRESS" in os.environ: - return os.environ.get("RAY_ADDRESS") + return "auto" # Avoid conflict with RAY_ADDRESS env var return find_redis_address_or_die() diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index f5628701f91b..57bf61419690 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -37,7 +37,6 @@ import ray.autoscaler._private.aws.config as aws_config import ray.scripts.scripts as scripts -from ray.test_utils import wait_for_condition boto3_list = [{ "InstanceType": "t1.micro", @@ -416,32 +415,5 @@ def commands_mock(command, stdin): _check_output_via_pattern("test_ray_submit.txt", result) -def test_ray_status(): - import ray - address = ray.init().get("redis_address") - runner = CliRunner() - - def output_ready(): - result = runner.invoke(scripts.status) - result.stdout - return not result.exception and "memory" in result.output - - wait_for_condition(output_ready) - - result = runner.invoke(scripts.status, []) - _check_output_via_pattern("test_ray_status.txt", result) - - result_arg = runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_arg) - - # Try to check status with RAY_ADDRESS set - os.environ["RAY_ADDRESS"] = address - result_env = runner.invoke(scripts.status) - _check_output_via_pattern("test_ray_status.txt", result_env) - - result_env_arg = 
runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_env_arg) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt b/python/ray/tests/test_cli_patterns/test_ray_status.txt deleted file mode 100644 index f903c6d62503..000000000000 --- a/python/ray/tests/test_cli_patterns/test_ray_status.txt +++ /dev/null @@ -1,14 +0,0 @@ -======== Cluster status: .+ -Node status ------------------------------------------------------------- - 1 node\(s\) with resources: .+ - -Resources ------------------------------------------------------------- -Usage: - 0.+ - 0.+ - 0.+ - -Demands: - \(no resource demands\) From cd2dd8c829fc7a5ce45b18ce40c14594aa5d8d96 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 170/244] Revert "[Core/Autoscaler] Properly clean up resource backlog from (#13727)" This reverts commit 127bcf10b9b1901c870adf95356da4367f64a8b8. --- .../raylet/scheduling/cluster_task_manager.cc | 21 ++---- .../scheduling/cluster_task_manager_test.cc | 71 +++++++------------ 2 files changed, 30 insertions(+), 62 deletions(-) diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index 43c6ce1cc78a..a395e51b5077 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -618,21 +618,12 @@ bool ClusterTaskManager::AnyPendingTasks(Task *exemplar, bool *any_pending, std::string ClusterTaskManager::DebugStr() const { // TODO(Shanly): This method will be replaced with `DebugString` once we remove the // legacy scheduler. 
- auto accumulator = [](int state, const std::pair> &pair) { - return state + pair.second.size(); - }; - int num_infeasible_tasks = - std::accumulate(infeasible_tasks_.begin(), infeasible_tasks_.end(), 0, accumulator); - int num_tasks_to_schedule = std::accumulate(tasks_to_schedule_.begin(), - tasks_to_schedule_.end(), 0, accumulator); - int num_tasks_to_dispatch = std::accumulate(tasks_to_dispatch_.begin(), - tasks_to_dispatch_.end(), 0, accumulator); std::stringstream buffer; buffer << "========== Node: " << self_node_id_ << " =================\n"; - buffer << "Infeasible queue length: " << num_infeasible_tasks << "\n"; - buffer << "Schedule queue length: " << num_tasks_to_schedule << "\n"; - buffer << "Dispatch queue length: " << num_tasks_to_dispatch << "\n"; + buffer << "Schedule queue length: " << tasks_to_schedule_.size() << "\n"; + buffer << "Dispatch queue length: " << tasks_to_dispatch_.size() << "\n"; buffer << "Waiting tasks size: " << waiting_tasks_.size() << "\n"; + buffer << "infeasible queue length size: " << infeasible_tasks_.size() << "\n"; buffer << "cluster_resource_scheduler state: " << cluster_resource_scheduler_->DebugString() << "\n"; buffer << "=================================================="; @@ -682,6 +673,7 @@ void ClusterTaskManager::Dispatch( const Task &task, rpc::RequestWorkerLeaseReply *reply, std::function send_reply_callback) { const auto &task_spec = task.GetTaskSpecification(); + RAY_LOG(DEBUG) << "Dispatching task " << task_spec.TaskId(); // Pass the contact info of the worker to use. reply->set_worker_pid(worker->GetProcess().GetId()); reply->mutable_worker_address()->set_ip_address(worker->IpAddress()); @@ -691,7 +683,6 @@ void ClusterTaskManager::Dispatch( RAY_CHECK(leased_workers.find(worker->WorkerId()) == leased_workers.end()); leased_workers[worker->WorkerId()] = worker; - RemoveFromBacklogTracker(task); // Update our internal view of the cluster state. 
std::shared_ptr allocated_resources; @@ -743,9 +734,7 @@ void ClusterTaskManager::Dispatch( } void ClusterTaskManager::Spillback(const NodeID &spillback_to, const Work &work) { - const auto &task = std::get<0>(work); - const auto &task_spec = task.GetTaskSpecification(); - RemoveFromBacklogTracker(task); + const auto &task_spec = std::get<0>(work).GetTaskSpecification(); RAY_LOG(DEBUG) << "Spilling task " << task_spec.TaskId() << " to node " << spillback_to; if (!cluster_resource_scheduler_->AllocateRemoteTaskResources( diff --git a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/cluster_task_manager_test.cc index 776e7fc53030..7c5f00820839 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc @@ -554,69 +554,48 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { *callback_occurred_ptr = true; }; - std::vector to_cancel; + std::shared_ptr worker = + std::make_shared(WorkerID::FromRandom(), 1234); + pool_.PushWorker(std::dynamic_pointer_cast(worker)); - // Don't add these fist 2 tasks to `to_cancel`. - for (int i = 0; i < 1; i++) { - Task task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); - task.SetBacklogSize(10 - i); - task_manager_.QueueAndScheduleTask(task, &reply, callback); - } + std::vector to_cancel; - for (int i = 1; i < 10; i++) { - Task task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); - task.SetBacklogSize(10 - i); + for (int i = 0; i < 10; i++) { + Task task = CreateTask({{ray::kCPU_ResourceLabel, 100}}); + task.SetBacklogSize(i); task_manager_.QueueAndScheduleTask(task, &reply, callback); to_cancel.push_back(task.GetTaskSpecification().TaskId()); } ASSERT_FALSE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); - ASSERT_EQ(pool_.workers.size(), 0); + ASSERT_EQ(pool_.workers.size(), 1); ASSERT_EQ(node_info_calls_, 0); - { // No tasks can run because the worker pool is empty. 
- auto data = std::make_shared(); - task_manager_.FillResourceUsage(data); - auto resource_load_by_shape = data->resource_load_by_shape(); - auto shape1 = resource_load_by_shape.resource_demands()[0]; - - ASSERT_EQ(shape1.backlog_size(), 55); - ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); - ASSERT_EQ(shape1.num_ready_requests_queued(), 10); - } - - // Push a worker so the first task can run. - std::shared_ptr worker = - std::make_shared(WorkerID::FromRandom(), 1234); - pool_.PushWorker(worker); - task_manager_.ScheduleAndDispatchTasks(); + auto data = std::make_shared(); + task_manager_.FillResourceUsage(data); - { - auto data = std::make_shared(); - task_manager_.FillResourceUsage(data); - auto resource_load_by_shape = data->resource_load_by_shape(); - auto shape1 = resource_load_by_shape.resource_demands()[0]; + auto resource_load_by_shape = data->resource_load_by_shape(); + auto shape1 = resource_load_by_shape.resource_demands()[0]; - ASSERT_TRUE(callback_occurred); - ASSERT_EQ(shape1.backlog_size(), 45); - ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); - ASSERT_EQ(shape1.num_ready_requests_queued(), 9); - } + ASSERT_EQ(shape1.backlog_size(), 45); + ASSERT_EQ(shape1.num_infeasible_requests_queued(), 10); + ASSERT_EQ(shape1.num_ready_requests_queued(), 0); - // Cancel the rest. 
for (auto &task_id : to_cancel) { ASSERT_TRUE(task_manager_.CancelTask(task_id)); } - RAY_LOG(ERROR) << "Finished cancelling tasks"; - { - auto data = std::make_shared(); - task_manager_.FillResourceUsage(data); - auto resource_load_by_shape = data->resource_load_by_shape(); - ASSERT_EQ(resource_load_by_shape.resource_demands().size(), 0); - AssertNoLeaks(); - } + data = std::make_shared(); + task_manager_.FillResourceUsage(data); + + resource_load_by_shape = data->resource_load_by_shape(); + shape1 = resource_load_by_shape.resource_demands()[0]; + + ASSERT_EQ(shape1.backlog_size(), 0); + ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); + ASSERT_EQ(shape1.num_ready_requests_queued(), 0); + AssertNoLeaks(); } TEST_F(ClusterTaskManagerTest, OwnerDeadTest) { From 3cd9677ad7a260d4ac765bb639ae5b40d2fb3f75 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 171/244] Revert "[CI] Add retry to java doc test (#13743)" This reverts commit cb69f55b394c897148f6430dda83abfae4238913. --- java/test.sh | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/java/test.sh b/java/test.sh index 86afc719b5b0..49a0d68bbdc5 100755 --- a/java/test.sh +++ b/java/test.sh @@ -71,18 +71,15 @@ RAY_BACKEND_LOG_LEVEL=debug java -cp bazel-bin/java/all_tests_deploy.jar -Dray.a -Dray.redis.password='123456' -Dray.job.code-search-path="$PWD/bazel-bin/java/all_tests_deploy.jar" io.ray.test.MultiDriverTest ray stop -# See issue #13742 the test is very flaky. -# Skipping the doc test for now. - -# echo "Running documentation demo code." -# docdemo_path="java/test/src/main/java/io/ray/docdemo/" -# for file in "$docdemo_path"*.java; do -# file=${file#"$docdemo_path"} -# class=${file%".java"} -# echo "Running $class" -# java -cp bazel-bin/java/all_tests_deploy.jar "io.ray.docdemo.$class" -# done -# popd +echo "Running documentation demo code." 
+docdemo_path="java/test/src/main/java/io/ray/docdemo/" +for file in "$docdemo_path"*.java; do + file=${file#"$docdemo_path"} + class=${file%".java"} + echo "Running $class" + java -cp bazel-bin/java/all_tests_deploy.jar "io.ray.docdemo.$class" +done +popd pushd "$ROOT_DIR" echo "Testing maven install." From a55cbff09a9cd7c016ba590ff201daed966cd492 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 172/244] Revert "Fix high CPU usage in object manager due to O(n^2) iteration over active pulls list (#13724)" This reverts commit baee1e4dffec110b432d9524e69f0c6e3ddfef72. --- src/ray/object_manager/pull_manager.cc | 29 ++++++++++++-------------- src/ray/object_manager/pull_manager.h | 6 ++---- src/ray/raylet/node_manager.cc | 10 ++++----- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index f4920a8def92..302f2f4354ef 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -51,8 +51,7 @@ uint64_t PullManager::Pull(const std::vector &object_ref_b bool PullManager::ActivateNextPullBundleRequest( const std::map>::iterator - &next_request_it, - std::vector *objects_to_pull) { + &next_request_it) { // Check that we have sizes for all of the objects in the bundle. If not, we // should not activate the bundle, since it may put us over the available // capacity. 
@@ -82,7 +81,6 @@ bool PullManager::ActivateNextPullBundleRequest( auto it = object_pull_requests_.find(obj_id); RAY_CHECK(it != object_pull_requests_.end()); num_bytes_being_pulled_ += it->second.object_size; - objects_to_pull->push_back(obj_id); } } @@ -93,8 +91,7 @@ bool PullManager::ActivateNextPullBundleRequest( } void PullManager::DeactivatePullBundleRequest( - const std::map>::iterator &request_it, - std::unordered_set *objects_to_cancel) { + const std::map>::iterator &request_it) { for (const auto &ref : request_it->second) { auto obj_id = ObjectRefToId(ref); RAY_CHECK(active_object_pull_requests_[obj_id].erase(request_it->first)); @@ -104,10 +101,6 @@ void PullManager::DeactivatePullBundleRequest( RAY_CHECK(it != object_pull_requests_.end()); num_bytes_being_pulled_ -= it->second.object_size; active_object_pull_requests_.erase(obj_id); - - if (objects_to_cancel) { - objects_to_cancel->insert(obj_id); - } } } @@ -127,9 +120,10 @@ void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) RAY_LOG(DEBUG) << "Updating pulls based on available memory: " << num_bytes_available; } num_bytes_available_ = num_bytes_available; + uint64_t prev_highest_req_id_being_pulled = highest_req_id_being_pulled_; + std::unordered_set object_ids_to_pull; // While there is available capacity, activate the next pull request. - std::vector objects_to_pull; while (num_bytes_being_pulled_ < num_bytes_available_) { // Get the next pull request in the queue. const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); @@ -151,7 +145,7 @@ void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) << " num bytes available: " << num_bytes_available_; // There is another pull bundle request that we could try, and there is // enough space. Activate the next pull bundle request in the queue. 
- if (!ActivateNextPullBundleRequest(next_request_it, &objects_to_pull)) { + if (!ActivateNextPullBundleRequest(next_request_it)) { // This pull bundle request could not be activated, due to lack of object // size information. Wait until we have object size information before // activating this pull bundle. @@ -168,15 +162,18 @@ void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) << " num bytes available: " << num_bytes_available_; const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); RAY_CHECK(last_request_it != pull_request_bundles_.end()); - DeactivatePullBundleRequest(last_request_it, &object_ids_to_cancel); + DeactivatePullBundleRequest(last_request_it); } TriggerOutOfMemoryHandlingIfNeeded(); - for (const auto &obj_id : objects_to_pull) { - if (object_ids_to_cancel.count(obj_id) == 0) { - TryToMakeObjectLocal(obj_id); - } + if (highest_req_id_being_pulled_ > prev_highest_req_id_being_pulled) { + // There are newly activated requests. Start pulling objects for the newly + // activated requests. + // NOTE(swang): We could also just wait for the next timer tick to pull the + // objects, but this would add a delay of up to one tick for any bundles of + // multiple objects, even when we are not under memory pressure. + Tick(); } } diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index 3a542fef7af2..26eba1a35264 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -146,14 +146,12 @@ class PullManager { /// any objects in the request that are not already being pulled. bool ActivateNextPullBundleRequest( const std::map>::iterator - &next_request_it, - std::vector *objects_to_pull); + &next_request_it); /// Deactivate a pull request in the queue. This cancels any pull or restore /// operations for the object. 
void DeactivatePullBundleRequest( - const std::map>::iterator &request_it, - std::unordered_set *objects_to_cancel = nullptr); + const std::map>::iterator &request_it); /// Trigger out-of-memory handling if the first request in the queue needs /// more space than the bytes available. This is needed to make room for the diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index e1ac5eb670bb..072064f4695a 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -2509,16 +2509,14 @@ rpc::ObjectStoreStats AccumulateStoreStats( rpc::ObjectStoreStats store_stats; for (const auto &reply : node_stats) { auto cur_store = reply.store_stats(); - // Use max aggregation for time, since the nodes are spilling concurrently. - store_stats.set_spill_time_total_s( - std::max(store_stats.spill_time_total_s(), cur_store.spill_time_total_s())); - store_stats.set_restore_time_total_s( - std::max(store_stats.restore_time_total_s(), cur_store.restore_time_total_s())); - // Use sum aggregation for the rest of the metrics. + store_stats.set_spill_time_total_s(store_stats.spill_time_total_s() + + cur_store.spill_time_total_s()); store_stats.set_spilled_bytes_total(store_stats.spilled_bytes_total() + cur_store.spilled_bytes_total()); store_stats.set_spilled_objects_total(store_stats.spilled_objects_total() + cur_store.spilled_objects_total()); + store_stats.set_restore_time_total_s(store_stats.restore_time_total_s() + + cur_store.restore_time_total_s()); store_stats.set_restored_bytes_total(store_stats.restored_bytes_total() + cur_store.restored_bytes_total()); store_stats.set_restored_objects_total(store_stats.restored_objects_total() + From 08055578a4796d2d169692f51c78a48a13506923 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 173/244] Revert "[Docker] default to /home/ray (#13738)" This reverts commit ad6f493bcf827e6e200882a6ae9a39c7da59ac39. 
--- docker/base-deps/Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index 278fad1ec73d..3aec50c99f80 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -69,5 +69,3 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ fi;) \ && sudo rm -rf /var/lib/apt/lists/* \ && sudo apt-get clean - -WORKDIR $HOME \ No newline at end of file From 10de9b506b60ca5a8a58673cf364125b4669c369 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 174/244] Revert "[Docker] Use Cuda 11 (#13691)" This reverts commit 909abd455f0031a88466ad878f910b72cca86c21. --- build-docker.sh | 2 +- ci/travis/build-docker-images.py | 2 +- ci/travis/build-docker-images.sh | 2 +- docker/base-deps/Dockerfile | 2 +- python/requirements_ml_docker.txt | 7 ++----- release/rllib_tests/unit_gpu_tests/requirements.txt | 6 ++---- 6 files changed, 8 insertions(+), 13 deletions(-) diff --git a/build-docker.sh b/build-docker.sh index b39336186caf..3a09b4896010 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -16,7 +16,7 @@ key="$1" case $key in --gpu) GPU="-gpu" - BASE_IMAGE="nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" + BASE_IMAGE="nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04" ;; --no-cache-build) NO_CACHE="--no-cache" diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py index c549bc95e60a..a2ae7a18d13c 100644 --- a/ci/travis/build-docker-images.py +++ b/ci/travis/build-docker-images.py @@ -84,7 +84,7 @@ def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]: build_args = {} if image_name == "base-deps": build_args["BASE_IMAGE"] = ( - "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" + "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04" if gpu == "-gpu" else "ubuntu:focal") else: build_args["GPU"] = gpu diff --git a/ci/travis/build-docker-images.sh b/ci/travis/build-docker-images.sh index 
6463c880f649..c894da23a662 100755 --- a/ci/travis/build-docker-images.sh +++ b/ci/travis/build-docker-images.sh @@ -22,7 +22,7 @@ build_and_push_tags() { # $2 tag for image (e.g. hash of commit) for GPU in "" "-gpu" do - BASE_IMAGE=$(if [ "$GPU" ]; then echo "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04"; else echo "ubuntu:focal"; fi;) + BASE_IMAGE=$(if [ "$GPU" ]; then echo "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"; else echo "ubuntu:focal"; fi;) FULL_NAME_WITH_TAG="rayproject/$1:$2$GPU" NIGHTLY_FULL_NAME_WITH_TAG="rayproject/$1:nightly$GPU" docker build --no-cache --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH=".whl/$WHEEL" --label "SHA=$2" -t "$FULL_NAME_WITH_TAG" /"$ROOT_DIR"/docker/"$1" diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index 3aec50c99f80..a5bcfedbf6be 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -1,6 +1,6 @@ # The base-deps Docker image installs main libraries needed to run Ray -# The GPU option is nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04 +# The GPU option is nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 ARG BASE_IMAGE="ubuntu:focal" FROM ${BASE_IMAGE} # If this arg is not "autoscaler" then no autoscaler requirements will be included diff --git a/python/requirements_ml_docker.txt b/python/requirements_ml_docker.txt index c61ba0c055f6..6f610c46862e 100644 --- a/python/requirements_ml_docker.txt +++ b/python/requirements_ml_docker.txt @@ -1,6 +1,3 @@ ipython -tensorflow-gpu>=2.4.0 --f https://download.pytorch.org/whl/torch_stable.html -torch==1.7.1+cu110 --f https://download.pytorch.org/whl/torch_stable.html -torchvision==0.8.2+cu110 \ No newline at end of file +tensorflow-gpu +torch \ No newline at end of file diff --git a/release/rllib_tests/unit_gpu_tests/requirements.txt b/release/rllib_tests/unit_gpu_tests/requirements.txt index b8a991f74f34..4f88975397f9 100644 --- a/release/rllib_tests/unit_gpu_tests/requirements.txt +++ 
b/release/rllib_tests/unit_gpu_tests/requirements.txt @@ -1,9 +1,7 @@ ray[rllib] ray --f https://download.pytorch.org/whl/torch_stable.html -torch==1.7.1+cu110 --f https://download.pytorch.org/whl/torch_stable.html -torchvision==0.8.2+cu110 +torch==1.6+cu101 +torchvision==0.7.0+cu101 boto3==1.4.8 cython==0.29.0 pytest From 49d3bbcb45d968fba0b2af9f594fa1bf8f51874d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 175/244] Revert "Remove docs for install-nightly (#13744)" This reverts commit 1ff570ce926e0553eefc38ab36deef02a4ba4969. --- doc/source/installation.rst | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 049d3ed28038..397113d95c04 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -24,7 +24,22 @@ You can install the latest official version of Ray as follows. Official releases Daily Releases (Nightlies) -------------------------- -You can install the nightly Ray wheels via the following links. These daily releases are tested via automated tests but do not go through the full release process. To install these wheels, use the following ``pip`` command and wheels: +You can install the latest Ray wheels via the following command. These daily releases are tested via automated tests but do not go through the full release process: + +.. code-block:: bash + + pip install -U ray + ray install-nightly + + +.. note:: ``ray install-nightly`` may not capture updated library dependencies. After running ``ray install-nightly``, consider running ``pip install ray[]`` *without upgrading (via -U)* to update dependencies. + + +.. note:: If you're currently on ``ray<=1.0.1.post1``, ``ray install-nightly`` will not install the most recent nightly wheels. Please use the links below instead. 
+ +Alternatively, here are the links to the latest wheels (which are built for each commit on the +master branch). To install these wheels, use the following ``pip`` command and wheels +instead of the ones above: .. code-block:: bash From 539b1cff196727bcfe54f9c92b5dab0607575af2 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 176/244] Revert "[Serve] Fix ServeHandle serialization (#13695)" This reverts commit b522b1ab99df346a1d0c97f0752d4f7e311aa4ad. --- python/ray/serve/api.py | 7 ----- python/ray/serve/handle.py | 25 +++++---------- python/ray/serve/tests/test_handle.py | 44 +-------------------------- 3 files changed, 8 insertions(+), 68 deletions(-) diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 19783dc3700b..b42cd78464a7 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -66,8 +66,6 @@ def check(self, *args, **kwargs): class ThreadProxiedRouter: def __init__(self, controller_handle, sync: bool): - self.controller_handle = controller_handle - self.sync = sync self.router = Router(controller_handle) if sync: @@ -94,11 +92,6 @@ def _remote(self, endpoint_name, handle_options, request_data, **kwargs) return coro - def __reduce__(self): - deserializer = ThreadProxiedRouter - serialized_data = (self.controller_handle, self.sync) - return deserializer, serialized_data - class Client: def __init__(self, diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 4ee2624a8d31..c6951c6380b9 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -4,6 +4,8 @@ from typing import Any, Dict, Optional, Union from enum import Enum +from ray.serve.router import Router + @dataclass(frozen=True) class HandleOptions: @@ -38,11 +40,10 @@ class RayServeHandle: # raises RayTaskError Exception """ - def __init__( - self, - router, # ThreadProxiedRouter - endpoint_name, - handle_options: Optional[HandleOptions] = None): + 
def __init__(self, + router: Router, + endpoint_name, + handle_options: Optional[HandleOptions] = None): self.router = router self.endpoint_name = endpoint_name self.handle_options = handle_options or HandleOptions() @@ -77,7 +78,7 @@ def options(self, async def remote(self, request_data: Optional[Union[Dict, Any]] = None, **kwargs): - """Issue an asynchronous request to the endpoint. + """Issue an asynchrounous request to the endpoint. Returns a Ray ObjectRef whose results can be waited for or retrieved using ray.wait or ray.get (or ``await object_ref``), respectively. @@ -97,12 +98,6 @@ async def remote(self, def __repr__(self): return f"{self.__class__.__name__}(endpoint='{self.endpoint_name}')" - def __reduce__(self): - deserializer = RayServeHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data - class RayServeSyncHandle(RayServeHandle): def remote(self, request_data: Optional[Union[Dict, Any]] = None, @@ -128,9 +123,3 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( coro, self.router.async_loop) return future.result() - - def __reduce__(self): - deserializer = RayServeSyncHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index 88ab9d2c2b7a..c17db7686aad 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -1,51 +1,9 @@ import requests -import pytest + import ray from ray import serve -@pytest.mark.asyncio -async def test_async_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - class TaskActor: - async def task(self, handle): - ref = await handle.remote() - output 
= await ref - return output - - handle = client.get_handle("f", sync=False) - - task_actor = TaskActor.remote() - result = await task_actor.task.remote(handle) - assert result == "hello" - - -def test_sync_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - def task(handle): - return ray.get(handle.remote()) - - handle = client.get_handle("f", sync=True) - result_ref = task.remote(handle) - assert ray.get(result_ref) == "hello" - - def test_handle_in_endpoint(serve_instance): client = serve_instance From 779653f481033689eb1d3a4806c62d2c697c2aa0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 177/244] Revert "[serve] Fix whacky worker replica failure test (#13696)" This reverts commit 77cebbb39ff609fb5a69411497f08edbe0ef5d7d. --- python/ray/serve/tests/test_failure.py | 49 +++++++++++++------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py index 3cba01ffb3ba..de7003c39f8f 100644 --- a/python/ray/serve/tests/test_failure.py +++ b/python/ray/serve/tests/test_failure.py @@ -1,11 +1,13 @@ import os import requests import sys +import tempfile import time import pytest import ray from ray.test_utils import wait_for_condition +from ray import serve from ray.serve.config import BackendConfig, ReplicaConfig @@ -158,30 +160,34 @@ def __call__(self, *args): def test_worker_replica_failure(serve_instance): client = serve_instance - @ray.remote - class Counter: - def __init__(self): - self.count = 0 - - def inc_and_get(self): - self.count += 1 - return self.count - class Worker: # Assumes that two replicas are started. Will hang forever in the # constructor for any workers that are restarted. 
- def __init__(self, counter): + def __init__(self, path): self.should_hang = False - self.index = ray.get(counter.inc_and_get.remote()) - if self.index > 2: + if not os.path.exists(path): + with open(path, "w") as f: + f.write("1") + else: + with open(path, "r") as f: + num = int(f.read()) + + with open(path, "w") as f: + if num == 2: + self.should_hang = True + else: + f.write(str(num + 1)) + + if self.should_hang: while True: pass def __call__(self, *args): - return self.index + pass - counter = Counter.remote() - client.create_backend("replica_failure", Worker, counter) + temp_path = os.path.join(tempfile.gettempdir(), + serve.utils.get_random_letters()) + client.create_backend("replica_failure", Worker, temp_path) client.update_backend_config( "replica_failure", BackendConfig(num_replicas=2)) client.create_endpoint( @@ -189,16 +195,9 @@ def __call__(self, *args): # Wait until both replicas have been started. responses = set() - start = time.time() - while time.time() - start < 30: + while len(responses) == 1: + responses.add(request_with_retries("/replica_failure", timeout=1).text) time.sleep(0.1) - response = request_with_retries("/replica_failure", timeout=1).text - assert response in ["1", "2"] - responses.add(response) - if len(responses) > 1: - break - else: - raise TimeoutError("Timed out waiting for replicas after 30s.") # Kill one of the replicas. handles = _get_worker_handles(client, "replica_failure") @@ -264,4 +263,6 @@ def f(_): if __name__ == "__main__": + import sys + import pytest sys.exit(pytest.main(["-v", "-s", __file__])) From 2d6397a157374f6a75661d16aa619c6d10577058 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 178/244] Revert "Don't gather check_parent_task on Windows, since it's undefined. (#13700)" This reverts commit b8f365caab6f6102897a0cb24fc79de2e3a99829. 
--- dashboard/agent.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index a1afb5f77f2a..7bf5e1551a2b 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -185,11 +185,8 @@ async def _check_parent(): agent_port=self.grpc_port, agent_ip_address=self.ip)) - tasks = [m.run(self.server) for m in modules] - if sys.platform not in ["win32", "cygwin"]: - tasks.append(check_parent_task) - await asyncio.gather(*tasks) - + await asyncio.gather(check_parent_task, + *(m.run(self.server) for m in modules)) await self.server.wait_for_termination() # Wait for finish signal. await runner.cleanup() From b64422812c08e6a7e13de2d031432bc8f0efca8c Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 179/244] Revert "[tune] add type hints to tune.run(), fix abstract methods of ProgressReporter (#13684)" This reverts commit d47636509808d505e4cfeaa5e9445f3d0484fde9. --- python/ray/tune/progress_reporter.py | 7 -- python/ray/tune/tune.py | 145 ++++++++++++--------------- 2 files changed, 66 insertions(+), 86 deletions(-) diff --git a/python/ray/tune/progress_reporter.py b/python/ray/tune/progress_reporter.py index a462f8e51ef3..a71a2da546a8 100644 --- a/python/ray/tune/progress_reporter.py +++ b/python/ray/tune/progress_reporter.py @@ -57,13 +57,6 @@ def report(self, trials: List[Trial], done: bool, *sys_info: Dict): """ raise NotImplementedError - def set_search_properties(self, metric: Optional[str], - mode: Optional[str]): - return True - - def set_total_samples(self, total_samples: int): - pass - class TuneReporterBase(ProgressReporter): """Abstract base class for the default Tune reporters. 
diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index 009335c6073f..fab7b79bf5e5 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -1,35 +1,25 @@ -from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Type, \ - Union - -import datetime import logging import sys import time -from ray.tune.analysis import ExperimentAnalysis -from ray.tune.callback import Callback from ray.tune.error import TuneError -from ray.tune.experiment import Experiment, convert_to_experiment_list -from ray.tune.logger import Logger -from ray.tune.progress_reporter import CLIReporter, JupyterNotebookReporter, \ - ProgressReporter -from ray.tune.ray_trial_executor import RayTrialExecutor -from ray.tune.registry import get_trainable_cls -from ray.tune.stopper import Stopper -from ray.tune.suggest import BasicVariantGenerator, SearchAlgorithm, \ - SearchGenerator +from ray.tune.experiment import convert_to_experiment_list, Experiment +from ray.tune.analysis import ExperimentAnalysis +from ray.tune.suggest import BasicVariantGenerator, SearchGenerator from ray.tune.suggest.suggestion import Searcher from ray.tune.suggest.variant_generator import has_unresolved_values -from ray.tune.syncer import SyncConfig, set_sync_periods, wait_for_sync -from ray.tune.trainable import Trainable from ray.tune.trial import Trial -from ray.tune.trial_runner import TrialRunner +from ray.tune.trainable import Trainable +from ray.tune.ray_trial_executor import RayTrialExecutor from ray.tune.utils.callback import create_default_callbacks +from ray.tune.registry import get_trainable_cls +from ray.tune.syncer import wait_for_sync, set_sync_periods, \ + SyncConfig +from ray.tune.trial_runner import TrialRunner +from ray.tune.progress_reporter import CLIReporter, JupyterNotebookReporter +from ray.tune.schedulers import FIFOScheduler from ray.tune.utils.log import Verbosity, has_verbosity, set_verbosity -# Must come last to avoid circular imports -from ray.tune.schedulers 
import FIFOScheduler, TrialScheduler - logger = logging.getLogger(__name__) try: @@ -65,51 +55,50 @@ def _report_progress(runner, reporter, done=False): def run( - run_or_experiment: Union[str, Callable, Type], - name: Optional[str] = None, - metric: Optional[str] = None, - mode: Optional[str] = None, - stop: Union[None, Mapping, Stopper, Callable[[str, Mapping], - bool]] = None, - time_budget_s: Union[None, int, float, datetime.timedelta] = None, - config: Optional[Dict[str, Any]] = None, - resources_per_trial: Optional[Mapping[str, Union[float, int]]] = None, - num_samples: int = 1, - local_dir: Optional[str] = None, - search_alg: Optional[Union[Searcher, SearchAlgorithm]] = None, - scheduler: Optional[TrialScheduler] = None, - keep_checkpoints_num: Optional[int] = None, - checkpoint_score_attr: Optional[str] = None, - checkpoint_freq: int = 0, - checkpoint_at_end: bool = False, - verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, - progress_reporter: Optional[ProgressReporter] = None, - log_to_file: bool = False, - trial_name_creator: Optional[Callable[[Trial], str]] = None, - trial_dirname_creator: Optional[Callable[[Trial], str]] = None, - sync_config: Optional[SyncConfig] = None, - export_formats: Optional[Sequence] = None, - max_failures: int = 0, - fail_fast: bool = False, - restore: Optional[str] = None, - server_port: Optional[int] = None, - resume: bool = False, - queue_trials: bool = False, - reuse_actors: bool = False, - trial_executor: Optional[RayTrialExecutor] = None, - raise_on_failed_trial: bool = True, - callbacks: Optional[Sequence[Callback]] = None, + run_or_experiment, + name=None, + metric=None, + mode=None, + stop=None, + time_budget_s=None, + config=None, + resources_per_trial=None, + num_samples=1, + local_dir=None, + search_alg=None, + scheduler=None, + keep_checkpoints_num=None, + checkpoint_score_attr=None, + checkpoint_freq=0, + checkpoint_at_end=False, + verbose=Verbosity.V3_TRIAL_DETAILS, + progress_reporter=None, + 
log_to_file=False, + trial_name_creator=None, + trial_dirname_creator=None, + sync_config=None, + export_formats=None, + max_failures=0, + fail_fast=False, + restore=None, + server_port=None, + resume=False, + queue_trials=False, + reuse_actors=False, + trial_executor=None, + raise_on_failed_trial=True, + callbacks=None, # Deprecated args - loggers: Optional[Sequence[Type[Logger]]] = None, - ray_auto_init: Optional = None, - run_errored_only: Optional = None, - global_checkpoint_period: Optional = None, - with_server: Optional = None, - upload_dir: Optional = None, - sync_to_cloud: Optional = None, - sync_to_driver: Optional = None, - sync_on_checkpoint: Optional = None, -) -> ExperimentAnalysis: + loggers=None, + ray_auto_init=None, + run_errored_only=None, + global_checkpoint_period=None, + with_server=None, + upload_dir=None, + sync_to_cloud=None, + sync_to_driver=None, + sync_on_checkpoint=None, +): """Executes training. Examples: @@ -469,20 +458,18 @@ def run( default_mode=mode) -def run_experiments( - experiments: Union[Experiment, Mapping, Sequence[Union[Experiment, - Mapping]]], - scheduler: Optional[TrialScheduler] = None, - server_port: Optional[int] = None, - verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, - progress_reporter: Optional[ProgressReporter] = None, - resume: bool = False, - queue_trials: bool = False, - reuse_actors: bool = False, - trial_executor: Optional[RayTrialExecutor] = None, - raise_on_failed_trial: bool = True, - concurrent: bool = True, - callbacks: Optional[Sequence[Callback]] = None): +def run_experiments(experiments, + scheduler=None, + server_port=None, + verbose=Verbosity.V3_TRIAL_DETAILS, + progress_reporter=None, + resume=False, + queue_trials=False, + reuse_actors=False, + trial_executor=None, + raise_on_failed_trial=True, + concurrent=True, + callbacks=None): """Runs and blocks until all trials finish. 
Examples: From e12457ea004c0152a12795cc3ee0cfb4483ab45e Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 180/244] Revert "[tune] fix non-deterministic category sampling by switching back to `np.random.choice` (#13710)" This reverts commit d8cb6b1a124a4351498afa27c646a5986f3d01f6. --- python/ray/tune/ray_trial_executor.py | 1 - python/ray/tune/sample.py | 5 ++-- python/ray/tune/suggest/zoopt.py | 6 +++-- python/ray/tune/tests/test_sample.py | 34 +++------------------------ 4 files changed, 10 insertions(+), 36 deletions(-) diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py index 26480118c2b0..a1fd4a8f3d06 100644 --- a/python/ray/tune/ray_trial_executor.py +++ b/python/ray/tune/ray_trial_executor.py @@ -573,7 +573,6 @@ def get_next_available_trial(self, timeout: Optional[float] = None): return None shuffled_results = list(self._running.keys()) random.shuffle(shuffled_results) - # Note: We shuffle the results because `ray.wait` by default returns # the first available result, and we want to guarantee that slower # trials (i.e. trials that run remotely) also get fairly reported. diff --git a/python/ray/tune/sample.py b/python/ray/tune/sample.py index 3be1b61e0c68..e4d349ee9db1 100644 --- a/python/ray/tune/sample.py +++ b/python/ray/tune/sample.py @@ -1,4 +1,5 @@ import logging +import random from copy import copy from inspect import signature from math import isclose @@ -294,7 +295,7 @@ def sample(self, spec: Optional[Union[List[Dict], Dict]] = None, size: int = 1): - items = np.random.choice(domain.categories, size=size).tolist() + items = random.choices(domain.categories, k=size) return items if len(items) > 1 else domain.cast(items[0]) default_sampler_cls = _Uniform @@ -470,7 +471,7 @@ def choice(categories: List): """Sample a categorical value. 
Sampling from ``tune.choice([1, 2])`` is equivalent to sampling from - ``np.random.choice([1, 2])`` + ``random.choice([1, 2])`` """ return Categorical(categories).uniform() diff --git a/python/ray/tune/suggest/zoopt.py b/python/ray/tune/suggest/zoopt.py index 71cedffd5500..c0c0ddb18562 100644 --- a/python/ray/tune/suggest/zoopt.py +++ b/python/ray/tune/suggest/zoopt.py @@ -198,8 +198,8 @@ def _setup_zoopt(self): init_samples = None if self._points_to_evaluate: - logger.warning("`points_to_evaluate` is ignored by ZOOpt in " - "versions <= 0.4.1.") + logger.warning( + "`points_to_evaluate` seems to be ignored by ZOOpt.") init_samples = [ Solution(x=tuple(point[dim] for dim in self._dim_keys)) for point in self._points_to_evaluate @@ -213,6 +213,8 @@ def _setup_zoopt(self): parameter=par, parallel_num=self.parallel_num, **self.kwargs) + if init_samples: + self.optimizer.init_attribute() def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool: diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py index 0b752e1be207..378a2c1ef565 100644 --- a/python/ray/tune/tests/test_sample.py +++ b/python/ray/tune/tests/test_sample.py @@ -193,32 +193,6 @@ def testQuantized(self): samples = tune.sample.Float(0, 33).quantized(3).sample(size=1000) self.assertTrue(all(0 <= s <= 33 for s in samples)) - def testCategoricalSeedInTrainingLoop(self): - def train(config): - return 0 - - config = { - "integer": tune.randint(0, 100_000), - "choice": tune.choice(list(range(100_000))) - } - - np.random.seed(1000) - - out_1 = tune.run(train, config=config, num_samples=8, verbose=0) - - integers_1 = [t.config["integer"] for t in out_1.trials] - choices_1 = [t.config["choice"] for t in out_1.trials] - - np.random.seed(1000) - - out_2 = tune.run(train, config=config, num_samples=8, verbose=0) - - integers_2 = [t.config["integer"] for t in out_2.trials] - choices_2 = [t.config["choice"] for t in out_2.trials] - - 
self.assertSequenceEqual(integers_1, integers_2) - self.assertSequenceEqual(choices_1, choices_2) - def testConvertAx(self): from ray.tune.suggest.ax import AxSearch from ax.service.ax_client import AxClient @@ -978,11 +952,9 @@ def testPointsToEvaluateSkOpt(self): return self._testPointsToEvaluate(SkOptSearch, config) def testPointsToEvaluateZoOpt(self): - self.skipTest( - "ZOOpt's latest release (0.4.1) does not support sampling " - "initial points. Please re-enable this test after the next " - "release.") - + # https://github.com/polixir/ZOOpt/issues/5 + self.skipTest("ZoOpt currently ignores initial points. This test " + "will be enabled after this has been fixed.") config = { "metric": tune.sample.Categorical([1, 2, 3, 4]).uniform(), "a": tune.sample.Categorical(["t1", "t2", "t3", "t4"]).uniform(), From b1c5af65f40aa4e7104e04be6e50fa5bc8f4b6bc Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 181/244] Revert "[Placement Group]Add detached support for placement group. (#13582)" This reverts commit fbe9a68f4e4b6c764081da7781d49d0b0a83c775. 
--- doc/source/placement-group.rst | 36 ------ python/ray/_raylet.pyx | 6 +- python/ray/actor.py | 4 +- python/ray/includes/common.pxd | 3 +- python/ray/tests/test_placement_group.py | 113 ------------------ python/ray/util/placement_group.py | 17 +-- src/ray/common/placement_group.h | 6 +- src/ray/core_worker/common.h | 9 +- src/ray/core_worker/core_worker.cc | 4 +- ...io_ray_runtime_task_NativeTaskSubmitter.cc | 3 +- .../gcs_server/gcs_placement_group_manager.cc | 12 +- .../gcs_server/gcs_placement_group_manager.h | 8 +- src/ray/gcs/test/gcs_test_util.h | 5 +- src/ray/protobuf/common.proto | 2 - src/ray/protobuf/gcs.proto | 2 - 15 files changed, 21 insertions(+), 209 deletions(-) diff --git a/doc/source/placement-group.rst b/doc/source/placement-group.rst index 1424b850c9c8..6fe8bc3a894d 100644 --- a/doc/source/placement-group.rst +++ b/doc/source/placement-group.rst @@ -252,42 +252,6 @@ Note that you can anytime remove the placement group to clean up resources. ray.shutdown() -Placement Group Lifetimes -------------------------- - -.. tabs:: - .. group-tab:: Python - - By default, the lifetimes of placement groups are not detached and will be destroyed - when the driver is terminated (but, if it is created from a detached actor, it is - killed when the detached actor is killed). If you'd like to keep the placement group - alive regardless of its job or detached actor, you should specify - `lifetime="detached"`. For example: - - .. code-block:: python - - # first_driver.py - pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="STRICT_SPREAD", lifetime="detached") - ray.get(pg.ready()) - - The placement group's lifetime will be independent of the driver now. This means it - is possible to retrieve the placement group from other drivers regardless of when - the current driver exits. Let's see an example: - - .. 
code-block:: python - - # second_driver.py - table = ray.util.placement_group_table() - print(len(table)) - - Note that the lifetime option is decoupled from the name. If we only specified - the name without specifying ``lifetime="detached"``, then the placement group can - only be retrieved as long as the original driver is still running. - - .. group-tab:: Java - - The lifetime argument is not implemented for Java APIs yet. - Tips for Using Placement Groups ------------------------------- - Learn the :ref:`lifecycle ` of placement groups. diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 0fc3f4bf25da..8ba80852fb40 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -1184,8 +1184,7 @@ cdef class CoreWorker: self, c_string name, c_vector[unordered_map[c_string, double]] bundles, - c_string strategy, - c_bool is_detached): + c_string strategy): cdef: CPlacementGroupID c_placement_group_id CPlacementStrategy c_strategy @@ -1209,8 +1208,7 @@ cdef class CoreWorker: CPlacementGroupCreationOptions( name, c_strategy, - bundles, - is_detached + bundles ), &c_placement_group_id)) diff --git a/python/ray/actor.py b/python/ray/actor.py index 547a2929db15..499cd1eacd36 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -584,9 +584,7 @@ def _remote(self, elif lifetime == "detached": detached = True else: - raise ValueError( - "actor `lifetime` argument must be either `None` or 'detached'" - ) + raise ValueError("lifetime must be either `None` or 'detached'") if placement_group_capture_child_tasks is None: placement_group_capture_child_tasks = ( diff --git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index 679ff6f0aa3b..a7ba4b23b8b2 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -270,8 +270,7 @@ cdef extern from "ray/core_worker/common.h" nogil: CPlacementGroupCreationOptions( const c_string &name, CPlacementStrategy strategy, - const c_vector[unordered_map[c_string, double]] 
&bundles, - c_bool is_detached + const c_vector[unordered_map[c_string, double]] &bundles ) cdef extern from "ray/gcs/gcs_client.h" nogil: diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 87273a4998c9..7c5963f9e8a1 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -1309,119 +1309,6 @@ def is_all_placement_group_removed(): wait_for_condition(is_all_placement_group_removed) - ray.shutdown() - - -def test_detached_placement_group(ray_start_cluster): - cluster = ray_start_cluster - for _ in range(2): - cluster.add_node(num_cpus=3) - cluster.wait_for_nodes() - info = ray.init(address=cluster.address) - - # Make sure detached placement group will alive when job dead. - driver_code = f""" -import ray - -ray.init(address="{info["redis_address"]}") - -pg = ray.util.placement_group( - [{{"CPU": 1}} for _ in range(2)], - strategy="STRICT_SPREAD", lifetime="detached") -ray.get(pg.ready()) - -@ray.remote(num_cpus=1) -class Actor: - def ready(self): - return True - -for bundle_index in range(2): - actor = Actor.options(lifetime="detached", placement_group=pg, - placement_group_bundle_index=bundle_index).remote() - ray.get(actor.ready.remote()) - -ray.shutdown() - """ - - run_string_as_driver(driver_code) - - # Wait until the driver is reported as dead by GCS. 
- def is_job_done(): - jobs = ray.jobs() - for job in jobs: - if "StopTime" in job: - return True - return False - - def assert_alive_num_pg(expected_num_pg): - alive_num_pg = 0 - for _, placement_group_info in ray.util.placement_group_table().items( - ): - if placement_group_info["state"] == "CREATED": - alive_num_pg += 1 - return alive_num_pg == expected_num_pg - - def assert_alive_num_actor(expected_num_actor): - alive_num_actor = 0 - for actor_info in ray.actors().values(): - if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE: - alive_num_actor += 1 - return alive_num_actor == expected_num_actor - - wait_for_condition(is_job_done) - - assert assert_alive_num_pg(1) - assert assert_alive_num_actor(2) - - # Make sure detached placement group will alive when its creator which - # is detached actor dead. - # Test actors first. - @ray.remote(num_cpus=1) - class NestedActor: - def ready(self): - return True - - @ray.remote(num_cpus=1) - class Actor: - def __init__(self): - self.actors = [] - - def ready(self): - return True - - def schedule_nested_actor_with_detached_pg(self): - # Create placement group which is detached. - pg = ray.util.placement_group( - [{ - "CPU": 1 - } for _ in range(2)], - strategy="STRICT_SPREAD", - lifetime="detached", - name="detached_pg") - ray.get(pg.ready()) - # Schedule nested actor with the placement group. - for bundle_index in range(2): - actor = NestedActor.options( - placement_group=pg, - placement_group_bundle_index=bundle_index, - lifetime="detached").remote() - ray.get(actor.ready.remote()) - self.actors.append(actor) - - a = Actor.options(lifetime="detached").remote() - ray.get(a.ready.remote()) - # 1 parent actor and 2 children actor. - ray.get(a.schedule_nested_actor_with_detached_pg.remote()) - - # Kill an actor and wait until it is killed. - ray.kill(a) - with pytest.raises(ray.exceptions.RayActorError): - ray.get(a.ready.remote()) - - # We should have 2 alive pgs and 4 alive actors. 
- assert assert_alive_num_pg(2) - assert assert_alive_num_actor(4) - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/placement_group.py b/python/ray/util/placement_group.py index 6d15f607f22c..be24772ab518 100644 --- a/python/ray/util/placement_group.py +++ b/python/ray/util/placement_group.py @@ -145,8 +145,7 @@ def _fill_bundle_cache_if_needed(self): def placement_group(bundles: List[Dict[str, float]], strategy: str = "PACK", - name: str = "unnamed_group", - lifetime=None) -> PlacementGroup: + name: str = "unnamed_group") -> PlacementGroup: """Asynchronously creates a PlacementGroup. Args: @@ -161,10 +160,6 @@ def placement_group(bundles: List[Dict[str, float]], - "STRICT_SPREAD": Packs Bundles across distinct nodes. name(str): The name of the placement group. - lifetime(str): Either `None`, which defaults to the placement group - will fate share with its creator and will be deleted once its - creator is dead, or "detached", which means the placement group - will live as a global object independent of the creator. Return: PlacementGroup: Placement group object. @@ -184,16 +179,8 @@ def placement_group(bundles: List[Dict[str, float]], "Bundles cannot be an empty dictionary or " f"resources with only 0 values. 
Bundles: {bundles}") - if lifetime is None: - detached = False - elif lifetime == "detached": - detached = True - else: - raise ValueError("placement group `lifetime` argument must be either" - " `None` or 'detached'") - placement_group_id = worker.core_worker.create_placement_group( - name, bundles, strategy, detached) + name, bundles, strategy) return PlacementGroup(placement_group_id) diff --git a/src/ray/common/placement_group.h b/src/ray/common/placement_group.h index 532f69d74ef9..a068ce4a1e51 100644 --- a/src/ray/common/placement_group.h +++ b/src/ray/common/placement_group.h @@ -67,9 +67,8 @@ class PlacementGroupSpecBuilder { PlacementGroupSpecBuilder &SetPlacementGroupSpec( const PlacementGroupID &placement_group_id, std::string name, const std::vector> &bundles, - const rpc::PlacementStrategy strategy, const bool is_detached, - const JobID &creator_job_id, const ActorID &creator_actor_id, - bool is_creator_detached_actor) { + const rpc::PlacementStrategy strategy, const JobID &creator_job_id, + const ActorID &creator_actor_id, bool is_creator_detached_actor) { message_->set_placement_group_id(placement_group_id.Binary()); message_->set_name(name); message_->set_strategy(strategy); @@ -83,7 +82,6 @@ class PlacementGroupSpecBuilder { message_->set_creator_job_dead(is_creator_detached_actor); message_->set_creator_actor_id(creator_actor_id.Binary()); message_->set_creator_actor_dead(creator_actor_id.IsNil()); - message_->set_is_detached(is_detached); for (size_t i = 0; i < bundles.size(); i++) { auto resources = bundles[i]; diff --git a/src/ray/core_worker/common.h b/src/ray/core_worker/common.h index bb10aff958ad..1716fe606de9 100644 --- a/src/ray/core_worker/common.h +++ b/src/ray/core_worker/common.h @@ -144,11 +144,8 @@ using PlacementStrategy = rpc::PlacementStrategy; struct PlacementGroupCreationOptions { PlacementGroupCreationOptions( std::string name, PlacementStrategy strategy, - std::vector> bundles, bool is_detached) - : name(std::move(name)), - 
strategy(strategy), - bundles(std::move(bundles)), - is_detached(is_detached) {} + std::vector> bundles) + : name(std::move(name)), strategy(strategy), bundles(std::move(bundles)) {} /// The name of the placement group. const std::string name; @@ -156,8 +153,6 @@ struct PlacementGroupCreationOptions { const PlacementStrategy strategy = rpc::PACK; /// The resource bundles in this placement group. const std::vector> bundles; - /// Whether to keep the placement group persistent after its creator dead. - const bool is_detached = false; }; } // namespace ray diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 2f5dcc57efc1..f7e473eca5a2 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1463,8 +1463,8 @@ Status CoreWorker::CreatePlacementGroup( builder.SetPlacementGroupSpec( placement_group_id, placement_group_creation_options.name, placement_group_creation_options.bundles, placement_group_creation_options.strategy, - placement_group_creation_options.is_detached, worker_context_.GetCurrentJobID(), - worker_context_.GetCurrentActorID(), worker_context_.CurrentActorDetached()); + worker_context_.GetCurrentJobID(), worker_context_.GetCurrentActorID(), + worker_context_.CurrentActorDetached()); PlacementGroupSpecification placement_group_spec = builder.Build(); *return_placement_group_id = placement_group_id; RAY_LOG(INFO) << "Submitting Placement Group creation to GCS: " << placement_group_id; diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc index cd374b76a272..5470f70fb395 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc @@ -201,8 +201,7 @@ inline ray::PlacementGroupCreationOptions ToPlacementGroupCreationOptions( }); }); return 
ray::PlacementGroupCreationOptions(JavaStringToNativeString(env, name), - ConvertStrategy(java_strategy), bundles, - /*is_detached=*/false); + ConvertStrategy(java_strategy), bundles); } #ifdef __cplusplus diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc index a856002b6465..b56f6b1d3b81 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc @@ -96,15 +96,11 @@ void GcsPlacementGroup::MarkCreatorActorDead() { placement_group_table_data_.set_creator_actor_dead(true); } -bool GcsPlacementGroup::IsPlacementGroupLifetimeDone() const { - return !IsDetached() && placement_group_table_data_.creator_job_dead() && +bool GcsPlacementGroup::IsPlacementGroupRemovable() const { + return placement_group_table_data_.creator_job_dead() && placement_group_table_data_.creator_actor_dead(); } -bool GcsPlacementGroup::IsDetached() const { - return placement_group_table_data_.is_detached(); -} - ///////////////////////////////////////////////////////////////////////////////////////// GcsPlacementGroupManager::GcsPlacementGroupManager( @@ -499,7 +495,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenJobDead( continue; } placement_group->MarkCreatorJobDead(); - if (placement_group->IsPlacementGroupLifetimeDone()) { + if (placement_group->IsPlacementGroupRemovable()) { RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {}); } } @@ -513,7 +509,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenActorDead( continue; } placement_group->MarkCreatorActorDead(); - if (placement_group->IsPlacementGroupLifetimeDone()) { + if (placement_group->IsPlacementGroupRemovable()) { RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {}); } } diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index 
28ce82090077..c76849108990 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -61,7 +61,6 @@ class GcsPlacementGroup { placement_group_spec.creator_job_dead()); placement_group_table_data_.set_creator_actor_dead( placement_group_spec.creator_actor_dead()); - placement_group_table_data_.set_is_detached(placement_group_spec.is_detached()); } /// Get the immutable PlacementGroupTableData of this placement group. @@ -108,11 +107,8 @@ class GcsPlacementGroup { /// Mark that the creator actor of this placement group is dead. void MarkCreatorActorDead(); - /// Return True if the placement group lifetime is done. False otherwise. - bool IsPlacementGroupLifetimeDone() const; - - /// Returns whether or not this is a detached placement group. - bool IsDetached() const; + /// Return True if the placement group is removable. False otherwise. + bool IsPlacementGroupRemovable() const; private: /// The placement_group meta data which contains the task specification as well as the diff --git a/src/ray/gcs/test/gcs_test_util.h b/src/ray/gcs/test/gcs_test_util.h index 4d51fdd866f6..bf908c3a278f 100644 --- a/src/ray/gcs/test/gcs_test_util.h +++ b/src/ray/gcs/test/gcs_test_util.h @@ -101,9 +101,8 @@ struct Mocker { PlacementGroupSpecBuilder builder; auto placement_group_id = PlacementGroupID::FromRandom(); - builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy, - /* is_detached */ false, job_id, actor_id, - /* is_creator_detached */ false); + builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy, job_id, + actor_id, /* is_creator_detached */ false); return builder.Build(); } diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 844f44bea723..cc3149e84f46 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -233,8 +233,6 @@ message PlacementGroupSpec { bool creator_job_dead = 7; // Whether or not if the creator 
actor is dead. bool creator_actor_dead = 8; - // Whether the placement group is persistent. - bool is_detached = 9; } message ObjectReference { diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 902c29cb7f58..1e59ae8123ca 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -191,8 +191,6 @@ message PlacementGroupTableData { bool creator_job_dead = 8; // Whether or not if the creator actor is dead. bool creator_actor_dead = 9; - // Whether the placement group is persistent. - bool is_detached = 10; } message ScheduleData { From 46f046ed5fe2a6b9ba18daa98fb67321bed61bd9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 182/244] Revert "[Object Spilling] Clean up FS storage upon sigint for ray.init(). (#13649)" This reverts commit b0fc13f52fed8617b05fe398b0f79a5dc1be8427. --- python/ray/external_storage.py | 43 +------- python/ray/node.py | 12 --- python/ray/tests/test_object_spilling.py | 129 +++++++---------------- python/ray/worker.py | 2 - 4 files changed, 40 insertions(+), 146 deletions(-) diff --git a/python/ray/external_storage.py b/python/ray/external_storage.py index f764e9c0fc5e..6e16351482cd 100644 --- a/python/ray/external_storage.py +++ b/python/ray/external_storage.py @@ -1,7 +1,5 @@ import abc -import logging import os -import shutil import urllib from collections import namedtuple from typing import List, IO, Tuple @@ -11,7 +9,6 @@ from ray._raylet import ObjectRef ParsedURL = namedtuple("ParsedURL", "base_url, offset, size") -logger = logging.getLogger(__name__) def create_url_with_offset(*, url: str, offset: int, size: int) -> str: @@ -179,14 +176,6 @@ def delete_spilled_objects(self, urls: List[str]): urls: URLs that store spilled object files. """ - @abc.abstractmethod - def destroy_external_storage(self): - """Destroy external storage when a head node is down. 
- - NOTE: This is currently working when the cluster is - started by ray.init - """ - class NullStorage(ExternalStorage): """The class that represents an uninitialized external storage.""" @@ -200,9 +189,6 @@ def restore_spilled_objects(self, object_refs, url_with_offset_list): def delete_spilled_objects(self, urls: List[str]): raise NotImplementedError("External storage is not initialized") - def destroy_external_storage(self): - raise NotImplementedError("External storage is not initialized") - class FileSystemStorage(ExternalStorage): """The class for filesystem-like external storage. @@ -213,8 +199,8 @@ class FileSystemStorage(ExternalStorage): """ def __init__(self, directory_path): - self.spill_dir_name = DEFAULT_OBJECT_PREFIX - self.directory_path = os.path.join(directory_path, self.spill_dir_name) + self.directory_path = directory_path + self.prefix = DEFAULT_OBJECT_PREFIX os.makedirs(self.directory_path, exist_ok=True) if not os.path.exists(self.directory_path): raise ValueError("The given directory path to store objects, " @@ -225,7 +211,7 @@ def spill_objects(self, object_refs) -> List[str]: return [] # Always use the first object ref as a key when fusioning objects. first_ref = object_refs[0] - filename = f"{first_ref.hex()}-multi-{len(object_refs)}" + filename = f"{self.prefix}-{first_ref.hex()}-multi-{len(object_refs)}" url = f"{os.path.join(self.directory_path, filename)}" with open(url, "wb") as f: return self._write_multiple_objects(f, object_refs, url) @@ -257,25 +243,6 @@ def delete_spilled_objects(self, urls: List[str]): filename = parse_url_with_offset(url.decode()).base_url os.remove(os.path.join(self.directory_path, filename)) - def destroy_external_storage(self): - # Q: Should we add stdout here to - # indicate we are deleting a directory? - - # There's a race condition where IO workers are still - # deleting each objects while we try deleting the - # whole directory. So we should keep trying it until - # The directory is actually deleted. 
- while os.path.isdir(self.directory_path): - try: - shutil.rmtree(self.directory_path) - except FileNotFoundError: - # If excpetion occurs when other IO workers are - # deleting the file at the same time. - pass - except Exception: - logger.exception("Error cleaning up spill files") - break - class ExternalStorageSmartOpenImpl(ExternalStorage): """The external storage class implemented by smart_open. @@ -364,9 +331,6 @@ def restore_spilled_objects(self, object_refs: List[ObjectRef], def delete_spilled_objects(self, urls: List[str]): pass - def destroy_external_storage(self): - pass - _external_storage = NullStorage() @@ -389,7 +353,6 @@ def setup_external_storage(config): raise ValueError(f"Unknown external storage type: {storage_type}") else: _external_storage = NullStorage() - return _external_storage def reset_external_storage(): diff --git a/python/ray/node.py b/python/ray/node.py index 2668d9aa0735..9130b39fbe86 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -421,9 +421,6 @@ def address_info(self): "metrics_export_port": self._metrics_export_port } - def is_head(self): - return self.head - def create_redis_client(self): """Create a redis client.""" return ray._private.services.create_redis_client( @@ -1155,12 +1152,3 @@ def remaining_processes_alive(self): True if any process that wasn't explicitly killed is still alive. 
""" return not any(self.dead_processes()) - - def destroy_external_storage(self): - object_spilling_config = self._config.get("object_spilling_config", {}) - if object_spilling_config: - object_spilling_config = json.loads(object_spilling_config) - from ray import external_storage - storage = external_storage.setup_external_storage( - object_spilling_config) - storage.destroy_external_storage() diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 3f5b5f7ae885..a80a91580c6f 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -3,7 +3,6 @@ import os import random import platform -import subprocess import sys import numpy as np @@ -11,7 +10,7 @@ import ray from ray.external_storage import (create_url_with_offset, parse_url_with_offset) -from ray.test_utils import wait_for_condition, run_string_as_driver +from ray.test_utils import wait_for_condition from ray.internal.internal_api import memory_summary bucket_name = "object-spilling-test" @@ -69,17 +68,6 @@ def multi_node_object_spilling_config(request, tmp_path): yield create_object_spilling_config(request, tmp_path) -def is_dir_empty(temp_folder, - append_path=ray.ray_constants.DEFAULT_OBJECT_PREFIX): - # append_path is used because the file based spilling will append - # new directory path. 
- num_files = 0 - temp_folder = temp_folder / append_path - for path in temp_folder.iterdir(): - num_files += 1 - return num_files == 0 - - def test_invalid_config_raises_exception(shutdown_only): # Make sure ray.init raises an exception before # it starts processes when invalid object spilling @@ -132,7 +120,13 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config, with pytest.raises(ray.exceptions.ObjectStoreFullError): ref2 = ray.put(arr) # noqa - wait_for_condition(lambda: is_dir_empty(temp_folder)) + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + + wait_for_condition(is_dir_empty) @pytest.mark.skipif( @@ -209,7 +203,7 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): ref = ray.put(arr) replay_buffer.append(ref) solution_buffer.append(arr) - print("spill done.") + # randomly sample objects for _ in range(1000): index = random.choice(list(range(buffer_length))) @@ -323,7 +317,6 @@ def test_spill_deadlock(object_spilling_config, shutdown_only): def test_delete_objects(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = object_spilling_config - ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -344,9 +337,15 @@ def test_delete_objects(object_spilling_config, shutdown_only): print("-----------------------------------") + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + del replay_buffer del ref - wait_for_condition(lambda: is_dir_empty(temp_folder)) + wait_for_condition(is_dir_empty) @pytest.mark.skipif( @@ -355,7 +354,6 @@ def test_delete_objects_delete_while_creating(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. 
object_spilling_config, temp_folder = object_spilling_config - ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -383,10 +381,16 @@ def test_delete_objects_delete_while_creating(object_spilling_config, sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + # After all, make sure all objects are killed without race condition. del replay_buffer del ref - wait_for_condition(lambda: is_dir_empty(temp_folder)) + wait_for_condition(is_dir_empty, timeout=1000) @pytest.mark.skipif( @@ -395,7 +399,6 @@ def test_delete_objects_on_worker_failure(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = object_spilling_config - ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -446,8 +449,14 @@ def wait_until_actor_dead(): wait_for_condition(wait_until_actor_dead) + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + # After all, make sure all objects are deleted upon worker failures. - wait_for_condition(lambda: is_dir_empty(temp_folder)) + wait_for_condition(is_dir_empty, timeout=1000) @pytest.mark.skipif( @@ -456,7 +465,6 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config, ray_start_cluster): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = multi_node_object_spilling_config - cluster = ray_start_cluster # Head node. cluster.add_node( @@ -510,12 +518,18 @@ def wait_until_actor_dead(actor): return True return False + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + # Kill actors to remove all references. for actor in actors: ray.kill(actor) wait_for_condition(lambda: wait_until_actor_dead(actor)) # The multi node deletion should work. 
- wait_for_condition(lambda: is_dir_empty(temp_folder)) + wait_for_condition(is_dir_empty) @pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") @@ -556,9 +570,6 @@ def test_fusion_objects(object_spilling_config, shutdown_only): assert np.array_equal(sample, solution) is_test_passing = False - # Since we'd like to see the temp directory that stores the files, - # we need to append this directory. - temp_folder = temp_folder / ray.ray_constants.DEFAULT_OBJECT_PREFIX for path in temp_folder.iterdir(): file_size = path.stat().st_size # Make sure there are at least one @@ -680,71 +691,5 @@ def allocate(*args): ray.get(tasks) -@pytest.mark.skipif( - platform.system() in ["Windows"], reason="Failing on " - "Windows and Mac.") -def test_file_deleted_when_driver_exits(tmp_path, shutdown_only): - # Limit our object store to 75 MiB of memory. - temp_folder = tmp_path / "spill" - temp_folder.mkdir() - - driver = """ -import json -import os -import signal -import numpy as np - -import ray - -ray.init( - object_store_memory=75 * 1024 * 1024, - _system_config={{ - "max_io_workers": 2, - "min_spilling_size": 0, - "automatic_object_spilling_enabled": True, - "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({{ - "type": "filesystem", - "params": {{ - "directory_path": "{temp_dir}" - }} - }}), - }}) -arr = np.random.rand(1024 * 1024) # 8 MB data -replay_buffer = [] - -# Spill lots of objects -for _ in range(30): - ref = None - while ref is None: - ref = ray.put(arr) - replay_buffer.append(ref) -# Send sigterm to itself. -signum = {signum} -sig = None -if signum == 2: - sig = signal.SIGINT -elif signum == 15: - sig = signal.SIGTERM -os.kill(os.getpid(), sig) -""" - - # Run a driver with sigint. 
- print("Sending sigint...") - with pytest.raises(subprocess.CalledProcessError): - print( - run_string_as_driver( - driver.format(temp_dir=str(temp_folder), signum=2))) - wait_for_condition(lambda: is_dir_empty(temp_folder, append_path="")) - - # Q: Looks like Sigterm doesn't work with Ray? - # print("Sending sigterm...") - # # Run a driver with sigterm. - # with pytest.raises(subprocess.CalledProcessError): - # print(run_string_as_driver( - # driver.format(temp_dir=str(temp_folder), signum=15))) - # wait_for_condition(is_dir_empty, timeout=1000) - - if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/worker.py b/python/ray/worker.py index 337b4ffc95fe..350bbc6491e5 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -818,8 +818,6 @@ def shutdown(_exiting_interpreter=False): # Shut down the Ray processes. global _global_node if _global_node is not None: - if _global_node.is_head(): - _global_node.destroy_external_storage() _global_node.kill_all_processes(check_alive=False, allow_graceful=True) _global_node = None From fcad9812ed2e65757f408f10dc089a1c0f0cbec1 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 183/244] Revert "[Logging] Log rotation config (#13375)" This reverts commit 055ef9cbd9c3b3296981b7bdcc66e9a3a9a839db. 
--- python/ray/_private/services.py | 67 ++++++---------- python/ray/node.py | 32 +------- python/ray/ray_constants.py | 11 +-- python/ray/tests/test_logging.py | 112 --------------------------- python/ray/workers/default_worker.py | 15 ---- src/ray/common/ray_config_def.h | 9 --- src/ray/util/logging.cc | 16 +--- 7 files changed, 34 insertions(+), 228 deletions(-) delete mode 100644 python/ray/tests/test_logging.py diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index d0eafc9693c6..688babad6ac9 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1045,9 +1045,7 @@ def start_log_monitor(redis_address, stdout_file=None, stderr_file=None, redis_password=None, - fate_share=None, - max_bytes=0, - backup_count=0): + fate_share=None): """Start a log monitor process. Args: @@ -1058,20 +1056,17 @@ def start_log_monitor(redis_address, stderr_file: A file handle opened for writing to redirect stderr to. If no redirection should happen, then this should be None. redis_password (str): The password of the redis server. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" log_monitor_filepath = os.path.join(RAY_PATH, "log_monitor.py") command = [ - sys.executable, "-u", log_monitor_filepath, - f"--redis-address={redis_address}", f"--logs-dir={logs_dir}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}" + sys.executable, + "-u", + log_monitor_filepath, + f"--redis-address={redis_address}", + f"--logs-dir={logs_dir}", ] if redis_password: command += ["--redis-password", redis_password] @@ -1093,9 +1088,7 @@ def start_dashboard(require_dashboard, stdout_file=None, stderr_file=None, redis_password=None, - fate_share=None, - max_bytes=0, - backup_count=0): + fate_share=None): """Start a dashboard process. Args: @@ -1114,10 +1107,6 @@ def start_dashboard(require_dashboard, stderr_file: A file handle opened for writing to redirect stderr to. If no redirection should happen, then this should be None. redis_password (str): The password of the redis server. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
@@ -1143,11 +1132,14 @@ def start_dashboard(require_dashboard, dashboard_dir = "new_dashboard" dashboard_filepath = os.path.join(RAY_PATH, dashboard_dir, "dashboard.py") command = [ - sys.executable, "-u", dashboard_filepath, f"--host={host}", - f"--port={port}", f"--redis-address={redis_address}", - f"--temp-dir={temp_dir}", f"--log-dir={logdir}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}" + sys.executable, + "-u", + dashboard_filepath, + f"--host={host}", + f"--port={port}", + f"--redis-address={redis_address}", + f"--temp-dir={temp_dir}", + f"--log-dir={logdir}", ] if redis_password: @@ -1266,9 +1258,7 @@ def start_raylet(redis_address, fate_share=None, socket_to_use=None, head_node=False, - start_initial_python_workers_for_first_job=False, - max_bytes=0, - backup_count=0): + start_initial_python_workers_for_first_job=False): """Start a raylet, which is a combined local scheduler and object manager. Args: @@ -1305,10 +1295,6 @@ def start_raylet(redis_address, config (dict|None): Optional Raylet configuration that will override defaults in RayConfig. java_worker_options (list): The command options for Java worker. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" @@ -1386,8 +1372,6 @@ def start_raylet(redis_address, f"--config-list={config_str}", f"--temp-dir={temp_dir}", f"--metrics-agent-port={metrics_agent_port}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}", "RAY_WORKER_DYNAMIC_OPTION_PLACEHOLDER", ] if redis_password: @@ -1418,8 +1402,6 @@ def start_raylet(redis_address, f"--raylet-name={raylet_name}", f"--temp-dir={temp_dir}", f"--log-dir={log_dir}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}", ] if redis_password is not None and len(redis_password) != 0: @@ -1798,9 +1780,7 @@ def start_monitor(redis_address, stderr_file=None, autoscaling_config=None, redis_password=None, - fate_share=None, - max_bytes=0, - backup_count=0): + fate_share=None): """Run a process to monitor the other processes. Args: @@ -1812,20 +1792,17 @@ def start_monitor(redis_address, no redirection should happen, then this should be None. autoscaling_config: path to autoscaling config file. redis_password (str): The password of the redis server. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" monitor_path = os.path.join(RAY_PATH, "monitor.py") command = [ - sys.executable, "-u", monitor_path, f"--logs-dir={logs_dir}", - f"--redis-address={redis_address}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}" + sys.executable, + "-u", + monitor_path, + f"--logs-dir={logs_dir}", + "--redis-address=" + str(redis_address), ] if autoscaling_config: command.append("--autoscaling-config=" + str(autoscaling_config)) diff --git a/python/ray/node.py b/python/ray/node.py index 9130b39fbe86..086865023e54 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -142,18 +142,6 @@ def __init__(self, if "plasma_store_as_thread" not in self._config: self._config["plasma_store_as_thread"] = True - # Configure log rotation parameters. - self.max_bytes = int( - os.getenv("RAY_ROTATION_MAX_BYTES", - ray_constants.LOGGING_ROTATE_BYTES)) - self.backup_count = int( - os.getenv("RAY_ROTATION_BACKUP_COUNT", - ray_constants.LOGGING_ROTATE_BACKUP_COUNT)) - - assert self.max_bytes >= 0 - assert self.backup_count >= 0 - - # Register the temp dir. 
if head: redis_client = None # date including microsecond @@ -399,14 +387,6 @@ def socket(self): except AttributeError: return None - @property - def logging_config(self): - """Get the logging config of the current node.""" - return { - "log_rotation_max_bytes": self.max_bytes, - "log_rotation_backup_count": self.backup_count - } - @property def address_info(self): """Get a dictionary of addresses.""" @@ -673,9 +653,7 @@ def start_log_monitor(self): stdout_file=subprocess.DEVNULL, stderr_file=subprocess.DEVNULL, redis_password=self._ray_params.redis_password, - fate_share=self.kernel_fate_share, - max_bytes=self.max_bytes, - backup_count=self.backup_count) + fate_share=self.kernel_fate_share) assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes self.all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] = [ process_info, @@ -699,8 +677,6 @@ def start_dashboard(self, require_dashboard): stderr_file=subprocess.DEVNULL, # Avoid hang(fd inherit) redis_password=self._ray_params.redis_password, fate_share=self.kernel_fate_share, - max_bytes=self.max_bytes, - backup_count=self.backup_count, port=self._ray_params.dashboard_port) assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes if process_info is not None: @@ -796,8 +772,6 @@ def start_raylet(self, fate_share=self.kernel_fate_share, socket_to_use=self.socket, head_node=self.head, - max_bytes=self.max_bytes, - backup_count=self.backup_count, start_initial_python_workers_for_first_job=self._ray_params. 
start_initial_python_workers_for_first_job) assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes @@ -823,9 +797,7 @@ def start_monitor(self): stderr_file=stderr_file, autoscaling_config=self._ray_params.autoscaling_config, redis_password=self._ray_params.redis_password, - fate_share=self.kernel_fate_share, - max_bytes=self.max_bytes, - backup_count=self.backup_count) + fate_share=self.kernel_fate_share) assert ray_constants.PROCESS_TYPE_MONITOR not in self.all_processes self.all_processes[ray_constants.PROCESS_TYPE_MONITOR] = [process_info] diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index 04dfd8f173b7..a5459b8637ba 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -150,9 +150,12 @@ def to_memory_units(memory_bytes, round_up): LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"] LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info'," " 'warning', 'error', 'critical'], default='info'") - -LOGGING_ROTATE_BYTES = 512 * 1024 * 1024 # 512MB. -LOGGING_ROTATE_BACKUP_COUNT = 5 # 5 Backup files at max. +# Default param for RotatingFileHandler +# maxBytes. 10G by default. We intentionally set the default value high +# so that users who won't care don't know about the existence of this. +LOGGING_ROTATE_BYTES = 10 * 1000 * 1000 * 1000 +# The default will grow logs up until 500GB without log loss. +LOGGING_ROTATE_BACKUP_COUNT = 50 # backupCount # Constants used to define the different process types. 
PROCESS_TYPE_REAPER = "reaper" @@ -169,8 +172,6 @@ def to_memory_units(memory_bytes, round_up): PROCESS_TYPE_REDIS_SERVER = "redis_server" PROCESS_TYPE_WEB_UI = "web_ui" PROCESS_TYPE_GCS_SERVER = "gcs_server" -PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER = "python-core-driver" -PROCESS_TYPE_PYTHON_CORE_WORKER = "python-core-worker" # Log file names MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log" diff --git a/python/ray/tests/test_logging.py b/python/ray/tests/test_logging.py deleted file mode 100644 index 6796ac4f7187..000000000000 --- a/python/ray/tests/test_logging.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -from collections import defaultdict -from pathlib import Path - -import ray -from ray import ray_constants - - -def set_logging_config(max_bytes, backup_count): - os.environ["RAY_ROTATION_MAX_BYTES"] = str(max_bytes) - os.environ["RAY_ROTATION_BACKUP_COUNT"] = str(backup_count) - - -def test_log_rotation_config(ray_start_cluster): - cluster = ray_start_cluster - max_bytes = 100 - backup_count = 3 - - # Create a cluster. - set_logging_config(max_bytes, backup_count) - head_node = cluster.add_node(num_cpus=0) - # Set a different env var for a worker node. 
- set_logging_config(0, 0) - worker_node = cluster.add_node(num_cpus=0) - cluster.wait_for_nodes() - - config = head_node.logging_config - assert config["log_rotation_max_bytes"] == max_bytes - assert config["log_rotation_backup_count"] == backup_count - config = worker_node.logging_config - assert config["log_rotation_max_bytes"] == 0 - assert config["log_rotation_backup_count"] == 0 - - -def test_log_rotation(shutdown_only): - max_bytes = 1 - backup_count = 3 - set_logging_config(max_bytes, backup_count) - ray.init(num_cpus=1) - session_dir = ray.worker.global_worker.node.address_info["session_dir"] - session_path = Path(session_dir) - log_dir_path = session_path / "logs" - - log_rotating_component = [ - ray_constants.PROCESS_TYPE_DASHBOARD, - ray_constants.PROCESS_TYPE_DASHBOARD_AGENT, - ray_constants.PROCESS_TYPE_LOG_MONITOR, - ray_constants.PROCESS_TYPE_MONITOR, - ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER, - ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER, - # Below components are not log rotating now. - # ray_constants.PROCESS_TYPE_RAYLET, - # ray_constants.PROCESS_TYPE_GCS_SERVER, - # ray_constants.PROCESS_TYPE_WORKER, - ] - - # Run the basic workload. - @ray.remote - def f(): - for i in range(10): - print(f"test {i}") - - ray.get(f.remote()) - - paths = list(log_dir_path.iterdir()) - - def component_exist(component, paths): - for path in paths: - filename = path.stem - if component in filename: - return True - return False - - def component_file_size_small_enough(component): - """Although max_bytes is 1, the file can have size that is big. - For example, if the logger prints the traceback, it can be - much bigger. So, we shouldn't make the assertion too tight. - """ - small_enough_bytes = 512 # 512 bytes. 
- for path in paths: - if not component_exist(component, [path]): - continue - - if path.stat().st_size > small_enough_bytes: - return False - return True - - for component in log_rotating_component: - assert component_exist(component, paths) - assert component_file_size_small_enough(component) - - # Check if the backup count is respected. - file_cnts = defaultdict(int) - for path in paths: - filename = path.stem - filename_without_suffix = filename.split(".")[0] - file_cnts[filename_without_suffix] += 1 - for filename, file_cnt in file_cnts.items(): - # There could be backup_count + 1 files. - # EX) *.log, *.log.* (as many as backup count). - assert file_cnt <= backup_count + 1, ( - f"{filename} has files that are more than " - f"backup count {backup_count}, file count: {file_cnt}") - - -if __name__ == "__main__": - import pytest - import sys - # Make subprocess happy in bazel. - os.environ["LC_ALL"] = "en_US.UTF-8" - os.environ["LANG"] = "en_US.UTF-8" - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/workers/default_worker.py b/python/ray/workers/default_worker.py index 7b9c2677bd0b..d9f7837ff2ce 100644 --- a/python/ray/workers/default_worker.py +++ b/python/ray/workers/default_worker.py @@ -109,21 +109,6 @@ help="A list of directories or jar files separated by colon that specify " "the search path for user code. 
This will be used as `CLASSPATH` in " "Java and `PYTHONPATH` in Python.") -parser.add_argument( - "--logging-rotate-bytes", - required=False, - type=int, - default=ray_constants.LOGGING_ROTATE_BYTES, - help="Specify the max bytes for rotating " - "log file, default is " - f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.") -parser.add_argument( - "--logging-rotate-backup-count", - required=False, - type=int, - default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, - help="Specify the backup count of rotated log file, default is " - f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.") if __name__ == "__main__": # NOTE(sang): For some reason, if we move the code below # to a separate function, tensorflow will capture that method diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index cd6bd84cee9c..d06a1c358196 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -369,12 +369,3 @@ RAY_CONFIG(bool, is_external_storage_type_fs, true) /// Whether to enable locality-aware leasing. If enabled, then Ray will consider task /// dependency locality when choosing a worker for leasing. RAY_CONFIG(bool, locality_aware_leasing_enabled, true) - -/* Configuration parameters for logging */ -/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's -/// maxBytes argument. -RAY_CONFIG(int64_t, log_rotation_max_bytes, 100 * 1024 * 1024) - -/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's -/// backupCount argument. -RAY_CONFIG(int64_t, log_rotation_backup_count, 5) diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index b06d64441087..1640c5cfc657 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -307,19 +307,11 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres #endif // Reset log pattern and level and we assume a log file can be rotated with // 10 files in max size 512M by default. 
- if (getenv("RAY_ROTATION_MAX_BYTES")) { - long max_size = std::atol(getenv("RAY_ROTATION_MAX_BYTES")); - // 0 means no log rotation in python, but not in spdlog. We just use the default - // value here. - if (max_size != 0) { - log_rotation_max_size_ = max_size; - } + if (getenv("RAY_ROTATION_MAX_SIZE")) { + log_rotation_max_size_ = std::atol(getenv("RAY_RAOTATION_MAX_SIZE")); } - if (getenv("RAY_ROTATION_BACKUP_COUNT")) { - long file_num = std::atol(getenv("RAY_ROTATION_BACKUP_COUNT")); - if (file_num != 0) { - log_rotation_file_num_ = file_num; - } + if (getenv("RAY_ROTATION_FILE_NUM")) { + log_rotation_file_num_ = std::atol(getenv("RAY_ROTATION_FILE_NUM")); } spdlog::set_pattern(log_format_pattern_); spdlog::set_level(static_cast(severity_threshold_)); From 6739c6b8f301649af82f97437340c11dc4f4e31c Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 184/244] Revert "[CI] Skip test_multi_node_3 on Windows (#13723)" This reverts commit 352ee2f91ef8e43abcf3412d43768962d62c956e. --- ci/travis/ci.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 82286c8c211c..d9c679bc7218 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -156,7 +156,6 @@ test_python() { -python/ray/tests:test_metrics_agent # timeout -python/ray/tests:test_multi_node -python/ray/tests:test_multi_node_2 - -python/ray/tests:test_multi_node_3 -python/ray/tests:test_multiprocessing # test_connect_to_ray() fails to connect to raylet -python/ray/tests:test_node_manager -python/ray/tests:test_object_manager From ce3020beeb3aed30dca5d9f70d444bcaa336b0b6 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 185/244] Revert "[Core] Better error if /dev/shm is too small (#13624)" This reverts commit a4d9dd8c05fe96efc8ad6e97cf089477eed87aca. 
--- python/ray/_private/services.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 688babad6ac9..c9ea996f9c0c 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1622,11 +1622,10 @@ def determine_plasma_store_config(object_store_memory, "This will harm performance! You may be able to free up " "space by deleting files in /dev/shm. If you are inside a " "Docker container, you can increase /dev/shm size by " - "passing '--shm-size={:.2f}gb' to 'docker run' (or add it " - "to the run_options list in a Ray cluster config). Make " - "sure to set this to more than 30% of available RAM.". - format(ray.utils.get_user_temp_dir(), shm_avail, - object_store_memory * (1.1) / (2**30))) + "passing '--shm-size=Xgb' to 'docker run' (or add it to " + "the run_options list in a Ray cluster config). Make sure " + "to set this to more than 2gb.".format( + ray.utils.get_user_temp_dir(), shm_avail)) else: plasma_directory = ray.utils.get_user_temp_dir() From c060a3870ef1fd6c289582798be267c4b48a7e16 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 186/244] Revert "Fix multiprocessing starmap to allow passing in zip (#13664)" This reverts commit ef1be1880b540c016026a59139699d4c99da25fe. 
--- python/ray/tests/test_multiprocessing.py | 1 - python/ray/util/multiprocessing/pool.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/tests/test_multiprocessing.py b/python/ray/tests/test_multiprocessing.py index 8ec3cb43c7df..3f63b72db19a 100644 --- a/python/ray/tests/test_multiprocessing.py +++ b/python/ray/tests/test_multiprocessing.py @@ -340,7 +340,6 @@ def f(*args): args = [tuple(range(i)) for i in range(100)] assert pool.starmap(f, args) == args - assert pool.starmap(lambda x, y: x + y, zip([1, 2], [3, 4])) == [4, 6] def test_callbacks(pool_4_processes): diff --git a/python/ray/util/multiprocessing/pool.py b/python/ray/util/multiprocessing/pool.py index 9910bc3a46a9..2d8f3d5fb911 100644 --- a/python/ray/util/multiprocessing/pool.py +++ b/python/ray/util/multiprocessing/pool.py @@ -494,7 +494,7 @@ def _submit_chunk(self, def _chunk_and_run(self, func, iterable, chunksize=None, unpack_args=False): if not hasattr(iterable, "__len__"): - iterable = list(iterable) + iterable = [iterable] if chunksize is None: chunksize = self._calculate_chunksize(iterable) From 86950beb7bcec31c808362e34c931b25fe458ffe Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 187/244] Revert "Revert "[CLI] Fix Ray Status with ENV Variable set (#13707)" (#13719)" This reverts commit 7691059a5fbed7f8ea920dc07f5c797a18dbc6cd. 
--- python/ray/_private/services.py | 2 +- python/ray/tests/test_cli.py | 19 +++++++++++++++++++ .../test_cli_patterns/test_ray_status.txt | 12 ++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 python/ray/tests/test_cli_patterns/test_ray_status.txt diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index c9ea996f9c0c..435c16d4eebc 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -216,7 +216,7 @@ def get_ray_address_to_use_or_die(): A string to pass into `ray.init(address=...)` """ if "RAY_ADDRESS" in os.environ: - return "auto" # Avoid conflict with RAY_ADDRESS env var + return os.environ.get("RAY_ADDRESS") return find_redis_address_or_die() diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index 57bf61419690..a6f1b1989ae9 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -415,5 +415,24 @@ def commands_mock(command, stdin): _check_output_via_pattern("test_ray_submit.txt", result) +def test_ray_status(): + import ray + address = ray.init().get("redis_address") + runner = CliRunner() + result = runner.invoke(scripts.status, []) + _check_output_via_pattern("test_ray_status.txt", result) + + result_arg = runner.invoke(scripts.status, ["--address", address]) + _check_output_via_pattern("test_ray_status.txt", result_arg) + + # Try to check status with RAY_ADDRESS set + os.environ["RAY_ADDRESS"] = address + result_env = runner.invoke(scripts.status) + _check_output_via_pattern("test_ray_status.txt", result_env) + + result_env_arg = runner.invoke(scripts.status, ["--address", address]) + _check_output_via_pattern("test_ray_status.txt", result_env_arg) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt b/python/ray/tests/test_cli_patterns/test_ray_status.txt new file mode 100644 index 000000000000..7169c5f0f096 --- /dev/null +++ 
b/python/ray/tests/test_cli_patterns/test_ray_status.txt @@ -0,0 +1,12 @@ +======== Cluster status: .+ +Node status +------------------------------------------------------------ + + +Resources +------------------------------------------------------------ +Usage: + + +Demands: + \(no resource demands\) From 4276b46605757b522eb02efde11a54a13fefaba7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 188/244] Revert "[Serve] Revert "Revert "[Serve] Refactor BackendState" (#13626) (#13697)" This reverts commit a2fcf9706a3e006a8e42db8c92d31cc6685ef12b. --- python/ray/serve/backend_state.py | 533 ++++++++++--------------- python/ray/serve/config.py | 4 +- python/ray/serve/controller.py | 4 +- python/ray/serve/tests/test_api.py | 3 - python/ray/serve/tests/test_failure.py | 3 - 5 files changed, 217 insertions(+), 330 deletions(-) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index 4aad2671ea4e..673c4b2cfbc8 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -1,8 +1,7 @@ import asyncio +from asyncio.futures import Future from collections import defaultdict -from enum import Enum -import time -from typing import Dict, List, Optional, Tuple +from typing import Dict, Any, List, Optional, Set, Tuple import ray import ray.cloudpickle as pickle @@ -18,6 +17,7 @@ ) from ray.serve.config import BackendConfig, ReplicaConfig from ray.serve.constants import LongPollKey +from ray.serve.exceptions import RayServeException from ray.serve.kv_store import RayInternalKVStore from ray.serve.long_poll import LongPollHost from ray.serve.utils import (format_actor_name, get_random_letters, logger, @@ -30,150 +30,6 @@ _RESOURCE_CHECK_ENABLED = True -class ReplicaState(Enum): - SHOULD_START = 1 - STARTING = 2 - RUNNING = 3 - SHOULD_STOP = 4 - STOPPING = 5 - STOPPED = 6 - - -class BackendReplica: - def __init__(self, controller_name: 
str, detached: bool, - replica_tag: ReplicaTag, backend_tag: BackendTag): - self._actor_name = format_actor_name(replica_tag, controller_name) - self._controller_name = controller_name - self._detached = detached - self._replica_tag = replica_tag - self._backend_tag = backend_tag - self._actor_handle = None - self._startup_obj_ref = None - self._drain_obj_ref = None - self._state = ReplicaState.SHOULD_START - - def __get_state__(self): - clean_dict = self.__dict__.copy() - del clean_dict["_actor_handle"] - del clean_dict["_startup_obj_ref"] - del clean_dict["_drain_obj_ref"] - return clean_dict - - def __set_state__(self, d): - self.__dict__ = d - self._actor_handle = None - self._startup_obj_ref = None - self._drain_obj_ref = None - self._recover_from_checkpoint() - - def _recover_from_checkpoint(self): - if self._state == ReplicaState.STARTING: - # We do not need to pass in the class here because the actor - # creation has already been started if this class was checkpointed - # in the STARTING state. - self.start() - elif self._state == ReplicaState.RUNNING: - # Fetch actor handles for all backend replicas in the system. - # The actors must exist if this class was checkpointed in the - # RUNNING state. 
- self._actor_handle = ray.get_actor(self._actor_name) - elif self._state == ReplicaState.STOPPING: - self.stop() - - def start(self, backend_info: Optional[BackendInfo]): - assert self._state in { - ReplicaState.SHOULD_START, ReplicaState.STARTING - }, (f"State must be {ReplicaState.SHOULD_START} or " - f"{ReplicaState.STARTING}, *not* {self._state}") - try: - self._actor_handle = ray.get_actor(self._actor_name) - except ValueError: - logger.debug("Starting replica '{}' for backend '{}'.".format( - self._replica_tag, self._backend_tag)) - self._actor_handle = ray.remote(backend_info.worker_class).options( - name=self._actor_name, - lifetime="detached" if self._detached else None, - max_restarts=-1, - max_task_retries=-1, - **backend_info.replica_config.ray_actor_options).remote( - self._backend_tag, self._replica_tag, - backend_info.replica_config.actor_init_args, - backend_info.backend_config, self._controller_name) - self._startup_obj_ref = self._actor_handle.ready.remote() - self._state = ReplicaState.STARTING - - def check_started(self): - if self._state == ReplicaState.RUNNING: - return True - assert self._state == ReplicaState.STARTING, ( - f"State must be {ReplicaState.STARTING}, *not* {self._state}") - ready, _ = ray.wait([self._startup_obj_ref], timeout=0) - if len(ready) == 1: - self._state = ReplicaState.RUNNING - return True - return False - - def set_should_stop(self, graceful_shutdown_timeout_s: Duration): - self._state = ReplicaState.SHOULD_STOP - self._graceful_shutdown_timeout_s = graceful_shutdown_timeout_s - - def stop(self): - # We need to handle transitions from: - # SHOULD_START -> SHOULD_STOP -> STOPPING - # This means that the replica_handle may not have been created. 
- - assert self._state in { - ReplicaState.SHOULD_STOP, ReplicaState.STOPPING - }, (f"State must be {ReplicaState.SHOULD_STOP} or " - f"{ReplicaState.STOPPING}, *not* {self._state}") - - def drain_actor(actor_name): - # NOTE: the replicas may already be stopped if we failed - # after stopping them but before writing a checkpoint. - try: - replica = ray.get_actor(actor_name) - except ValueError: - return None - return replica.drain_pending_queries.remote() - - self._state = ReplicaState.STOPPING - self._drain_obj_ref = drain_actor(self._actor_name) - self._shutdown_deadline = time.time( - ) + self._graceful_shutdown_timeout_s - - def check_stopped(self): - if self._state == ReplicaState.STOPPED: - return True - assert self._state == ReplicaState.STOPPING, ( - f"State must be {ReplicaState.STOPPING}, *not* {self._state}") - - try: - replica = ray.get_actor(self._actor_name) - except ValueError: - self._state = ReplicaState.STOPPED - return True - - ready, _ = ray.wait([self._drain_obj_ref], timeout=0) - timeout_passed = time.time() > self._shutdown_deadline - - if len(ready) == 1 or timeout_passed: - if timeout_passed: - # Graceful period passed, kill it forcefully. - logger.debug( - f"{self._actor_name} did not shutdown after " - f"{self._graceful_shutdown_timeout_s}s, force-killing.") - - ray.kill(replica, no_restart=True) - self._state = ReplicaState.STOPPED - return True - return False - - def get_actor_handle(self): - assert self._state == ReplicaState.RUNNING, ( - f"State must be {ReplicaState.RUNNING}, *not* {self._state}") - return self._actor_handle - - class BackendState: """Manages all state for backends in the system. 
@@ -190,65 +46,79 @@ def __init__(self, controller_name: str, detached: bool, self._long_poll_host = long_poll_host self._goal_manager = goal_manager - self._replicas: Dict[BackendTag, Dict[ReplicaState, List[ - BackendReplica]]] = defaultdict(lambda: defaultdict(list)) - self._backend_metadata: Dict[BackendTag, BackendInfo] = dict() - self._target_replicas: Dict[BackendTag, int] = defaultdict(int) - self.backend_goals: Dict[BackendTag, GoalId] = dict() + # Non-checkpointed state. + self.currently_starting_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag, ActorHandle]] = dict() + self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag]] = dict() - # Un-Checkpointed state. - self.pending_goals: Dict[GoalId, asyncio.Event] = dict() + # Checkpointed state. + self.backends: Dict[BackendTag, BackendInfo] = dict() + self.backend_replicas: Dict[BackendTag, Dict[ + ReplicaTag, ActorHandle]] = defaultdict(dict) + self.backend_goals: Dict[BackendTag, GoalId] = dict() + self.backend_replicas_to_start: Dict[BackendTag, List[ + ReplicaTag]] = defaultdict(list) + self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[ + ReplicaTag, Duration]]] = defaultdict(list) + self.backends_to_remove: List[BackendTag] = list() checkpoint = self._kv_store.get(CHECKPOINT_KEY) if checkpoint is not None: - (self._replicas, self._backend_metadata, self._target_replicas, - self.backend_goals, pending_goal_ids) = pickle.loads(checkpoint) + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backend_to_remove, + pending_goal_ids) = pickle.loads(checkpoint) for goal_id in pending_goal_ids: self._goal_manager.create_goal(goal_id) + # Fetch actor handles for all backend replicas in the system. + # All of these backend_replicas are guaranteed to already exist + # because they would not be written to a checkpoint in + # self.backend_replicas until they were created. 
+ for backend_tag, replica_dict in self.backend_replicas.items(): + for replica_tag in replica_dict.keys(): + replica_name = format_actor_name(replica_tag, + self._controller_name) + self.backend_replicas[backend_tag][ + replica_tag] = ray.get_actor(replica_name) + self._notify_backend_configs_changed() self._notify_replica_handles_changed() def _checkpoint(self) -> None: self._kv_store.put( CHECKPOINT_KEY, - pickle.dumps((self._replicas, self._backend_metadata, - self._target_replicas, self.backend_goals, - self._goal_manager.get_pending_goal_ids()))) + pickle.dumps( + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backends_to_remove, + self._goal_manager.get_pending_goal_ids()))) def _notify_backend_configs_changed(self) -> None: self._long_poll_host.notify_changed(LongPollKey.BACKEND_CONFIGS, self.get_backend_configs()) - def get_running_replica_handles( - self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: - return { - backend_tag: { - backend_replica._replica_tag: - backend_replica.get_actor_handle() - for backend_replica in state_to_replica_dict[ - ReplicaState.RUNNING] - } - for backend_tag, state_to_replica_dict in self._replicas.items() - } - def _notify_replica_handles_changed(self) -> None: self._long_poll_host.notify_changed( LongPollKey.REPLICA_HANDLES, { backend_tag: list(replica_dict.values()) - for backend_tag, replica_dict in - self.get_running_replica_handles().items() + for backend_tag, replica_dict in self.backend_replicas.items() }) def get_backend_configs(self) -> Dict[BackendTag, BackendConfig]: return { tag: info.backend_config - for tag, info in self._backend_metadata.items() + for tag, info in self.backends.items() } + def get_replica_handles( + self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: + return self.backend_replicas + def get_backend(self, backend_tag: BackendTag) -> Optional[BackendInfo]: - return 
self._backend_metadata.get(backend_tag) + return self.backends.get(backend_tag) def _set_backend_goal(self, backend_tag: BackendTag, backend_info: BackendInfo) -> None: @@ -256,11 +126,7 @@ def _set_backend_goal(self, backend_tag: BackendTag, new_goal_id = self._goal_manager.create_goal() if backend_info is not None: - self._backend_metadata[backend_tag] = backend_info - self._target_replicas[ - backend_tag] = backend_info.backend_config.num_replicas - else: - self._target_replicas[backend_tag] = 0 + self.backends[backend_tag] = backend_info self.backend_goals[backend_tag] = new_goal_id @@ -270,25 +136,31 @@ def create_backend(self, backend_tag: BackendTag, backend_config: BackendConfig, replica_config: ReplicaConfig) -> Optional[GoalId]: # Ensures this method is idempotent. - backend_info = self._backend_metadata.get(backend_tag) + backend_info = self.backends.get(backend_tag) if backend_info is not None: if (backend_info.backend_config == backend_config and backend_info.replica_config == replica_config): return None - backend_replica_class = create_backend_replica( - replica_config.func_or_class) + backend_replica = create_backend_replica(replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. backend_info = BackendInfo( - worker_class=backend_replica_class, + worker_class=backend_replica, backend_config=backend_config, replica_config=replica_config) new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, backend_info) + try: + self.scale_backend_replicas(backend_tag, + backend_config.num_replicas) + except RayServeException as e: + del self.backends[backend_tag] + raise e + # NOTE(edoakes): we must write a checkpoint before starting new # or pushing the updated config to avoid inconsistent state if we # crash while making the change. 
@@ -303,15 +175,20 @@ def delete_backend(self, backend_tag: BackendTag, force_kill: bool = False) -> Optional[GoalId]: # This method must be idempotent. We should validate that the # specified backend exists on the client. - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: return None + # Scale its replicas down to 0. + self.scale_backend_replicas(backend_tag, 0, force_kill) + + # Remove the backend's metadata. + del self.backends[backend_tag] + + # Add the intention to remove the backend from the routers. + self.backends_to_remove.append(backend_tag) + new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, None) - if force_kill: - self._backend_metadata[ - backend_tag].backend_config.\ - experimental_graceful_shutdown_timeout_s = 0 self._checkpoint() if existing_goal_id is not None: @@ -320,18 +197,20 @@ def delete_backend(self, backend_tag: BackendTag, def update_backend_config(self, backend_tag: BackendTag, config_options: BackendConfig): - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: raise ValueError(f"Backend {backend_tag} is not registered") - stored_backend_config = self._backend_metadata[ - backend_tag].backend_config + stored_backend_config = self.backends[backend_tag].backend_config updated_config = stored_backend_config.copy( update=config_options.dict(exclude_unset=True)) updated_config._validate_complete() - self._backend_metadata[backend_tag].backend_config = updated_config + self.backends[backend_tag].backend_config = updated_config new_goal_id, existing_goal_id = self._set_backend_goal( - backend_tag, self._backend_metadata[backend_tag]) + backend_tag, self.backends[backend_tag]) + + # Scale the replicas with the new configuration. 
+ self.scale_backend_replicas(backend_tag, updated_config.num_replicas) # NOTE(edoakes): we must write a checkpoint before pushing the # update to avoid inconsistent state if we crash after pushing the @@ -381,38 +260,31 @@ def _start_backend_replica(self, backend_tag: BackendTag, def scale_backend_replicas( self, backend_tag: BackendTag, - ) -> bool: + num_replicas: int, + force_kill: bool = False, + ) -> None: """Scale the given backend to the number of replicas. NOTE: this does not actually start or stop the replicas, but instead - adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP. - The caller is responsible for then first writing a checkpoint and then - actually starting/stopping the intended replicas. This avoids - inconsistencies with starting/stopping a replica and then crashing - before writing a checkpoint. + adds the intention to start/stop them to self.backend_replicas_to_start + and self.backend_replicas_to_stop. The caller is responsible for then + first writing a checkpoint and then actually starting/stopping the + intended replicas. This avoids inconsistencies with starting/stopping a + replica and then crashing before writing a checkpoint. 
""" - num_replicas = self._target_replicas.get(backend_tag, 0) logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) - assert (backend_tag in self._backend_metadata + assert (backend_tag in self.backends ), "Backend {} is not registered.".format(backend_tag) assert num_replicas >= 0, ("Number of replicas must be" " greater than or equal to 0.") - current_num_replicas = sum([ - len(self._replicas[backend_tag][ReplicaState.SHOULD_START]), - len(self._replicas[backend_tag][ReplicaState.STARTING]), - len(self._replicas[backend_tag][ReplicaState.RUNNING]), - ]) - + current_num_replicas = len(self.backend_replicas[backend_tag]) delta_num_replicas = num_replicas - current_num_replicas - backend_info: BackendInfo = self._backend_metadata[backend_tag] - if delta_num_replicas == 0: - return False - - elif delta_num_replicas > 0: + backend_info: BackendInfo = self.backends[backend_tag] + if delta_num_replicas > 0: can_schedule = try_schedule_resources_on_nodes(requirements=[ backend_info.replica_config.resource_dict for _ in range(delta_num_replicas) @@ -420,11 +292,10 @@ def scale_backend_replicas( if _RESOURCE_CHECK_ENABLED and not all(can_schedule): num_possible = sum(can_schedule) - logger.error( + raise RayServeException( "Cannot scale backend {} to {} replicas. Ray Serve tried " "to add {} replicas but the resources only allows {} " - "to be added. This is not a problem if the cluster is " - "autoscaling. To fix this, consider scaling to replica to " + "to be added. To fix this, consider scaling to replica to " "{} or add more resources to the cluster. 
You can check " "avaiable resources with ray.nodes().".format( backend_tag, num_replicas, delta_num_replicas, @@ -434,132 +305,154 @@ def scale_backend_replicas( delta_num_replicas, backend_tag)) for _ in range(delta_num_replicas): replica_tag = "{}#{}".format(backend_tag, get_random_letters()) - self._replicas[backend_tag][ReplicaState.SHOULD_START].append( - BackendReplica(self._controller_name, self._detached, - replica_tag, backend_tag)) + self.backend_replicas_to_start[backend_tag].append(replica_tag) elif delta_num_replicas < 0: logger.debug("Removing {} replicas from backend '{}'".format( -delta_num_replicas, backend_tag)) - assert self._target_replicas[backend_tag] >= delta_num_replicas - + assert len( + self.backend_replicas[backend_tag]) >= delta_num_replicas + replicas_copy = self.backend_replicas.copy() for _ in range(-delta_num_replicas): - replica_state_dict = self._replicas[backend_tag] - list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \ - or replica_state_dict[ReplicaState.STARTING] \ - or replica_state_dict[ReplicaState.RUNNING] - - assert len(list_to_use), replica_state_dict - replica_to_stop = list_to_use.pop() + replica_tag, _ = replicas_copy[backend_tag].popitem() graceful_timeout_s = (backend_info.backend_config. 
experimental_graceful_shutdown_timeout_s) - - replica_to_stop.set_should_stop(graceful_timeout_s) - self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append( - replica_to_stop) - - return True - - def scale_all_backends(self): - checkpoint_needed = False - for backend_tag, num_replicas in list(self._target_replicas.items()): - checkpoint_needed = (checkpoint_needed - or self.scale_backend_replicas(backend_tag)) - if num_replicas == 0: - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if checkpoint_needed: - self._checkpoint() - - def _pop_replicas_of_state(self, state: ReplicaState - ) -> List[Tuple[ReplicaState, BackendTag]]: - replicas = [] - for backend_tag, state_to_replica_dict in self._replicas.items(): - if state in state_to_replica_dict: - replicas.extend( - (replica, backend_tag) - for replica in state_to_replica_dict.pop(state)) - - return replicas + if force_kill: + graceful_timeout_s = 0 + self.backend_replicas_to_stop[backend_tag].append(( + replica_tag, + graceful_timeout_s, + )) + + def _start_pending_replicas(self): + for backend_tag, replicas_to_create in self.backend_replicas_to_start.\ + items(): + for replica_tag in replicas_to_create: + replica_handle = self._start_backend_replica( + backend_tag, replica_tag) + ready_future = replica_handle.ready.remote().as_future() + self.currently_starting_replicas[ready_future] = ( + backend_tag, replica_tag, replica_handle) + + def _stop_pending_replicas(self): + for backend_tag, replicas_to_stop in ( + self.backend_replicas_to_stop.items()): + for replica_tag, shutdown_timeout in replicas_to_stop: + replica_name = format_actor_name(replica_tag, + self._controller_name) + + async def kill_actor(replica_name_to_use): + # NOTE: the replicas may already be stopped if we failed + # after stopping them but before writing a checkpoint. 
+ try: + replica = ray.get_actor(replica_name_to_use) + except ValueError: + return + + try: + await asyncio.wait_for( + replica.drain_pending_queries.remote(), + timeout=shutdown_timeout) + except asyncio.TimeoutError: + # Graceful period passed, kill it forcefully. + logger.debug( + f"{replica_name_to_use} did not shutdown after " + f"{shutdown_timeout}s, killing.") + finally: + ray.kill(replica, no_restart=True) + + self.currently_stopping_replicas[asyncio.ensure_future( + kill_actor(replica_name))] = (backend_tag, replica_tag) + + async def _check_currently_starting_replicas(self) -> int: + """Returns the number of pending replicas waiting to start""" + in_flight: Set[Future[Any]] = set() + + if self.currently_starting_replicas: + done, in_flight = await asyncio.wait( + list(self.currently_starting_replicas.keys()), timeout=0) + for fut in done: + (backend_tag, replica_tag, + replica_handle) = self.currently_starting_replicas.pop(fut) + self.backend_replicas[backend_tag][ + replica_tag] = replica_handle + + backend = self.backend_replicas_to_start.get(backend_tag) + if backend: + try: + backend.remove(replica_tag) + except ValueError: + pass + if len(backend) == 0: + del self.backend_replicas_to_start[backend_tag] + + async def _check_currently_stopping_replicas(self) -> int: + """Returns the number of replicas waiting to stop""" + in_flight: Set[Future[Any]] = set() + + if self.currently_stopping_replicas: + done_stopping, in_flight = await asyncio.wait( + list(self.currently_stopping_replicas.keys()), timeout=0) + for fut in done_stopping: + (backend_tag, + replica_tag) = self.currently_stopping_replicas.pop(fut) + + backend_to_stop = self.backend_replicas_to_stop.get( + backend_tag) + + if backend_to_stop: + try: + backend_to_stop.remove(replica_tag) + except ValueError: + pass + if len(backend_to_stop) == 0: + del self.backend_replicas_to_stop[backend_tag] + + backend = self.backend_replicas.get(backend_tag) + if backend: + try: + del backend[replica_tag] + 
except KeyError: + pass + + if len(self.backend_replicas[backend_tag]) == 0: + del self.backend_replicas[backend_tag] def _completed_goals(self) -> List[GoalId]: completed_goals = [] - all_tags = set(self._replicas.keys()).union( - set(self._backend_metadata.keys())) + all_tags = set(self.backend_replicas.keys()).union( + set(self.backends.keys())) for backend_tag in all_tags: - desired_num_replicas = self._target_replicas.get(backend_tag) - state_dict = self._replicas.get(backend_tag, {}) - existing_info = state_dict.get(ReplicaState.RUNNING, []) - - # If we have pending ops, the current goal is *not* ready - if (state_dict.get(ReplicaState.SHOULD_START) - or state_dict.get(ReplicaState.STARTING) - or state_dict.get(ReplicaState.SHOULD_STOP) - or state_dict.get(ReplicaState.STOPPING)): - continue - - # TODO(ilr): FIX + desired_info = self.backends.get(backend_tag) + existing_info = self.backend_replicas.get(backend_tag) # Check for deleting - if (not desired_num_replicas or - desired_num_replicas == 0) and \ + if (not desired_info or + desired_info.backend_config.num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) + completed_goals.append(self.backend_goals.get(backend_tag)) # Check for a non-zero number of backends - if (desired_num_replicas and existing_info) \ - and desired_num_replicas == len(existing_info): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) + if desired_info and existing_info and desired_info.backend_config.\ + num_replicas == len(existing_info): + completed_goals.append(self.backend_goals.get(backend_tag)) return [goal for goal in completed_goals if goal] async def update(self) -> bool: - self.scale_all_backends() - for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_START): - 
replica_state.start(self._backend_metadata[backend_tag]) - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_STOP): - replica_state.stop() - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - transition_triggered = False - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STARTING): - if replica_state.check_started(): - self._replicas[backend_tag][ReplicaState.RUNNING].append( - replica_state) - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STOPPING): - if replica_state.check_stopped(): - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - for backend_tag in list(self._replicas.keys()): - if not any(self._replicas[backend_tag]): - del self._replicas[backend_tag] - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if transition_triggered: + self._start_pending_replicas() + self._stop_pending_replicas() + + num_starting = len(self.currently_starting_replicas) + num_stopping = len(self.currently_stopping_replicas) + + await self._check_currently_starting_replicas() + await self._check_currently_stopping_replicas() + + if (len(self.currently_starting_replicas) != num_starting) or \ + (len(self.currently_stopping_replicas) != num_stopping): self._checkpoint() self._notify_replica_handles_changed() diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 41a1eca08ae8..205af81b065a 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator +from pydantic import 
BaseModel, PositiveFloat, PositiveInt, validator from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT) @@ -64,7 +64,7 @@ class BackendConfig(BaseModel): user_config: Any = None experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0 - experimental_graceful_shutdown_timeout_s: confloat(ge=0) = 20.0 + experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0 class Config: validate_assignment = True diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index b5c65111a8f9..a3c75c711878 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -118,7 +118,7 @@ async def run_control_loop(self) -> None: def _all_replica_handles( self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: """Used for testing.""" - return self.backend_state.get_running_replica_handles() + return self.backend_state.get_replica_handles() def get_all_backends(self) -> Dict[BackendTag, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" @@ -235,7 +235,7 @@ async def shutdown(self) -> None: async with self.write_lock: for proxy in self.http_state.get_http_proxy_handles().values(): ray.kill(proxy, no_restart=True) - for replica_dict in self.backend_state.get_running_replica_handles( + for replica_dict in self.backend_state.get_replica_handles( ).values(): for replica in replica_dict.values(): ray.kill(replica, no_restart=True) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index a35f7e54b361..202b01386059 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -683,9 +683,6 @@ def f(): client.create_endpoint("endpoint", backend="backend") -# This error is only printed because creation is run in the control loop, not -# in the API path. 
-@pytest.mark.skip() def test_create_infeasible_error(serve_instance): client = serve_instance diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py index de7003c39f8f..7ecba4d51735 100644 --- a/python/ray/serve/tests/test_failure.py +++ b/python/ray/serve/tests/test_failure.py @@ -1,10 +1,8 @@ import os import requests -import sys import tempfile import time -import pytest import ray from ray.test_utils import wait_for_condition from ray import serve @@ -156,7 +154,6 @@ def __call__(self, *args): # Test that if there are multiple replicas for a worker and one dies # unexpectedly, the others continue to serve requests. -@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_worker_replica_failure(serve_instance): client = serve_instance From e1d1018274d620cc115adbdf62f4d60d10e900e0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 189/244] Revert "[ray_client] Fix and extend get_actor test to detached actors (#13016)" This reverts commit b656a2fb7cbe67bd154b253e019f090ae4dd76d1. 
--- python/ray/tests/test_client.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index 73b19a2f2ab9..30d6faccbad9 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -322,25 +322,12 @@ def get(self): actor.inc.remote() actor.inc.remote() + del actor - # Make sure the get_actor call works new_actor = ray.get_actor("test_acc") new_actor.inc.remote() assert ray.get(new_actor.get.remote()) == 3 - del actor - - actor = Accumulator.options( - name="test_acc2", lifetime="detached").remote() - actor.inc.remote() - del actor - - detatched_actor = ray.get_actor("test_acc2") - for i in range(5): - detatched_actor.inc.remote() - - assert ray.get(detatched_actor.get.remote()) == 6 - @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_internal_kv(ray_start_regular_shared): From 2a741e95ab0a30d737def00d036249b3e58697db Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 190/244] Revert "[CI] Split test_multi_node to avoid timeouts (#13712)" This reverts commit cf2ffa7941a19acf4eaec78a2199636f0868fead. 
--- python/ray/tests/BUILD | 1 - python/ray/tests/test_multi_node.py | 389 ++++++++++++++++++++++++- python/ray/tests/test_multi_node_3.py | 397 -------------------------- 3 files changed, 385 insertions(+), 402 deletions(-) delete mode 100644 python/ray/tests/test_multi_node_3.py diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 97980a641a4a..2ccdb4be2644 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -48,7 +48,6 @@ py_test_module_list( "test_metrics.py", "test_multi_node.py", "test_multi_node_2.py", - "test_multi_node_3.py", "test_multi_tenancy.py", "test_multinode_failures.py", "test_multinode_failures_2.py", diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index ae9ae1c1e981..fbce475c12af 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -1,13 +1,15 @@ import os import pytest +import subprocess import sys import time import ray -from ray.test_utils import (RayTestTimeoutException, run_string_as_driver, - run_string_as_driver_nonblocking, - wait_for_condition, init_error_pubsub, - get_error_message) +from ray.test_utils import ( + RayTestTimeoutException, check_call_ray, run_string_as_driver, + run_string_as_driver_nonblocking, wait_for_children_of_pid, + wait_for_children_of_pid_to_exit, wait_for_condition, kill_process_by_name, + Semaphore, init_error_pubsub, get_error_message) def test_remote_raylet_cleanup(ray_start_cluster): @@ -366,6 +368,385 @@ def wait_for_success_output(process_handle, timeout=10): process_handle.kill() +def test_calling_start_ray_head(call_ray_stop_only): + + # Test that we can call ray start with various command line + # parameters. TODO(rkn): This test only tests the --head code path. We + # should also test the non-head node code path. + + # Test starting Ray with a redis port specified. 
+ check_call_ray(["start", "--head", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with a node IP address specified. + check_call_ray( + ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with a system config parameter set. + check_call_ray([ + "start", "--head", "--system-config", + "{\"metrics_report_interval_ms\":100}", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with the object manager and node manager ports + # specified. + check_call_ray([ + "start", "--head", "--object-manager-port", "12345", + "--node-manager-port", "54321", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with the worker port range specified. + check_call_ray([ + "start", "--head", "--min-worker-port", "50000", "--max-worker-port", + "51000", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with a worker port list. + check_call_ray(["start", "--head", "--worker-port-list", "10000,10001"]) + check_call_ray(["stop"]) + + # Test starting Ray with a non-int in the worker port list. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray(["start", "--head", "--worker-port-list", "10000,a"]) + check_call_ray(["stop"]) + + # Test starting Ray with an invalid port in the worker port list. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray(["start", "--head", "--worker-port-list", "100"]) + check_call_ray(["stop"]) + + # Test starting Ray with the number of CPUs specified. + check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with the number of GPUs specified. + check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with redis shard ports specified. 
+ check_call_ray([ + "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port", + "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with all arguments specified. + check_call_ray([ + "start", "--head", "--redis-shard-ports", "6380,6381,6382", + "--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus", "0", + "--resources", "{\"Custom\": 1}", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with invalid arguments. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray( + ["start", "--head", "--address", "127.0.0.1:6379", "--port", "0"]) + check_call_ray(["stop"]) + + # Test --block. Killing a child process should cause the command to exit. + blocked = subprocess.Popen( + ["ray", "start", "--head", "--block", "--port", "0"]) + + wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) + + blocked.poll() + assert blocked.returncode is None + + kill_process_by_name("raylet") + wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) + blocked.wait() + assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" + + # Test --block. Killing the command should clean up all child processes. 
+ blocked = subprocess.Popen( + ["ray", "start", "--head", "--block", "--port", "0"]) + blocked.poll() + assert blocked.returncode is None + + wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) + + blocked.terminate() + wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) + blocked.wait() + assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" + + +@pytest.mark.parametrize( + "call_ray_start", + ["ray start --head --num-cpus=1 " + "--node-ip-address=localhost"], + indirect=True) +def test_using_hostnames(call_ray_start): + ray.init(_node_ip_address="localhost", address="localhost:6379") + + @ray.remote + def f(): + return 1 + + assert ray.get(f.remote()) == 1 + + +def test_connecting_in_local_case(ray_start_regular): + address_info = ray_start_regular + + # Define a driver that just connects to Redis. + driver_script = """ +import ray +ray.init(address="{}") +print("success") +""".format(address_info["redis_address"]) + + out = run_string_as_driver(driver_script) + # Make sure the other driver succeeded. + assert "success" in out + + +def test_run_driver_twice(ray_start_regular): + # We used to have issue 2165 and 2288: + # https://github.com/ray-project/ray/issues/2165 + # https://github.com/ray-project/ray/issues/2288 + # both complain that driver will hang when run for the second time. + # This test is used to verify the fix for above issue, it will run the + # same driver for twice and verify whether both of them succeed. 
+ address_info = ray_start_regular + driver_script = """ +import ray +import ray.tune as tune +import os +import time + +def train_func(config, reporter): # add a reporter arg + for i in range(2): + time.sleep(0.1) + reporter(timesteps_total=i, mean_accuracy=i+97) # report metrics + +os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" +ray.init(address="{}") +ray.tune.register_trainable("train_func", train_func) + +tune.run_experiments({{ + "my_experiment": {{ + "run": "train_func", + "stop": {{"mean_accuracy": 99}}, + "config": {{ + "layer1": {{ + "class_name": tune.grid_search(["a"]), + "config": {{"lr": tune.grid_search([1, 2])}} + }}, + }}, + "local_dir": os.path.expanduser("~/tmp") + }} +}}) +print("success") +""".format(address_info["redis_address"]) + + for i in range(2): + out = run_string_as_driver(driver_script) + assert "success" in out + + +@pytest.mark.skip(reason="fate sharing not implemented yet") +def test_driver_exiting_when_worker_blocked(call_ray_start): + # This test will create some drivers that submit some tasks and then + # exit without waiting for the tasks to complete. + address = call_ray_start + + ray.init(address=address) + + # Define a driver that creates two tasks, one that runs forever and the + # other blocked on the first in a `ray.get`. + driver_script = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def f(): + time.sleep(10**6) +@ray.remote +def g(): + ray.get(f.remote()) +g.remote() +time.sleep(1) +print("success") +""".format(address) + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + out = run_string_as_driver(driver_script) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that creates two tasks, one that runs forever and the + # other blocked on the first in a `ray.wait`. 
+ driver_script = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def f(): + time.sleep(10**6) +@ray.remote +def g(): + ray.wait([f.remote()]) +g.remote() +time.sleep(1) +print("success") +""".format(address) + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + out = run_string_as_driver(driver_script) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that creates one task that depends on a nonexistent + # object. This task will be queued as waiting to execute. + driver_script_template = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def g(x): + return +g.remote(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) +time.sleep(1) +print("success") +""" + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + nonexistent_id = ray.ObjectRef.from_random() + driver_script = driver_script_template.format(address, + nonexistent_id.hex()) + out = run_string_as_driver(driver_script) + # Simulate the nonexistent dependency becoming available. + ray.worker.global_worker.put_object(None, nonexistent_id) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that calls `ray.wait` on a nonexistent object. + driver_script_template = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def g(): + ray.wait(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) +g.remote() +time.sleep(1) +print("success") +""" + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + nonexistent_id = ray.ObjectRef.from_random() + driver_script = driver_script_template.format(address, + nonexistent_id.hex()) + out = run_string_as_driver(driver_script) + # Simulate the nonexistent dependency becoming available. 
+ ray.worker.global_worker.put_object(None, nonexistent_id) + # Make sure the first driver ran to completion. + assert "success" in out + + @ray.remote + def f(): + return 1 + + # Make sure we can still talk with the raylet. + ray.get(f.remote()) + + +def test_multi_driver_logging(ray_start_regular): + address_info = ray_start_regular + address = address_info["redis_address"] + + # ray.init(address=address) + driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0) + driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0) + main_wait = Semaphore.options(name="main_wait").remote(value=0) + + # The creation of an actor is asynchronous. + # We need to wait for the completion of the actor creation, + # otherwise we can't get the actor by name. + ray.get(driver1_wait.locked.remote()) + ray.get(driver2_wait.locked.remote()) + ray.get(main_wait.locked.remote()) + + # Params are address, semaphore name, output1, output2 + driver_script_template = """ +import ray +import sys +from ray.test_utils import Semaphore + +@ray.remote(num_cpus=0) +def remote_print(s, file=None): + print(s, file=file) + +ray.init(address="{}") + +driver_wait = ray.get_actor("{}") +main_wait = ray.get_actor("main_wait") + +ray.get(main_wait.release.remote()) +ray.get(driver_wait.acquire.remote()) + +s1 = "{}" +ray.get(remote_print.remote(s1)) + +ray.get(main_wait.release.remote()) +ray.get(driver_wait.acquire.remote()) + +s2 = "{}" +ray.get(remote_print.remote(s2)) + +ray.get(main_wait.release.remote()) + """ + + p1 = run_string_as_driver_nonblocking( + driver_script_template.format(address, "driver1_wait", "1", "2")) + p2 = run_string_as_driver_nonblocking( + driver_script_template.format(address, "driver2_wait", "3", "4")) + + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + # At this point both of the other drivers are fully initialized. 
+ + ray.get(driver1_wait.release.remote()) + ray.get(driver2_wait.release.remote()) + + # At this point driver1 should receive '1' and driver2 '3' + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + + ray.get(driver1_wait.release.remote()) + ray.get(driver2_wait.release.remote()) + + # At this point driver1 should receive '2' and driver2 '4' + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + + driver1_out = p1.stdout.read().decode("ascii") + driver2_out = p2.stdout.read().decode("ascii") + if sys.platform == "win32": + driver1_out = driver1_out.replace("\r", "") + driver2_out = driver2_out.replace("\r", "") + driver1_out_split = driver1_out.split("\n") + driver2_out_split = driver2_out.split("\n") + + assert driver1_out_split[0][-1] == "1", driver1_out_split + assert driver1_out_split[1][-1] == "2", driver1_out_split + assert driver2_out_split[0][-1] == "3", driver2_out_split + assert driver2_out_split[1][-1] == "4", driver2_out_split + + if __name__ == "__main__": import pytest # Make subprocess happy in bazel. diff --git a/python/ray/tests/test_multi_node_3.py b/python/ray/tests/test_multi_node_3.py deleted file mode 100644 index 9c270b64da55..000000000000 --- a/python/ray/tests/test_multi_node_3.py +++ /dev/null @@ -1,397 +0,0 @@ -import os -import pytest -import subprocess -import sys - -import ray -from ray.test_utils import ( - check_call_ray, run_string_as_driver, run_string_as_driver_nonblocking, - wait_for_children_of_pid, wait_for_children_of_pid_to_exit, - kill_process_by_name, Semaphore) - - -def test_calling_start_ray_head(call_ray_stop_only): - - # Test that we can call ray start with various command line - # parameters. TODO(rkn): This test only tests the --head code path. We - # should also test the non-head node code path. - - # Test starting Ray with a redis port specified. 
- check_call_ray(["start", "--head", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with a node IP address specified. - check_call_ray( - ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with a system config parameter set. - check_call_ray([ - "start", "--head", "--system-config", - "{\"metrics_report_interval_ms\":100}", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with the object manager and node manager ports - # specified. - check_call_ray([ - "start", "--head", "--object-manager-port", "12345", - "--node-manager-port", "54321", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with the worker port range specified. - check_call_ray([ - "start", "--head", "--min-worker-port", "50000", "--max-worker-port", - "51000", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with a worker port list. - check_call_ray(["start", "--head", "--worker-port-list", "10000,10001"]) - check_call_ray(["stop"]) - - # Test starting Ray with a non-int in the worker port list. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray(["start", "--head", "--worker-port-list", "10000,a"]) - check_call_ray(["stop"]) - - # Test starting Ray with an invalid port in the worker port list. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray(["start", "--head", "--worker-port-list", "100"]) - check_call_ray(["stop"]) - - # Test starting Ray with the number of CPUs specified. - check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with the number of GPUs specified. - check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with redis shard ports specified. 
- check_call_ray([ - "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port", - "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with all arguments specified. - check_call_ray([ - "start", "--head", "--redis-shard-ports", "6380,6381,6382", - "--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus", "0", - "--resources", "{\"Custom\": 1}", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with invalid arguments. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray( - ["start", "--head", "--address", "127.0.0.1:6379", "--port", "0"]) - check_call_ray(["stop"]) - - # Test --block. Killing a child process should cause the command to exit. - blocked = subprocess.Popen( - ["ray", "start", "--head", "--block", "--port", "0"]) - - wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) - - blocked.poll() - assert blocked.returncode is None - - kill_process_by_name("raylet") - wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) - blocked.wait() - assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" - - # Test --block. Killing the command should clean up all child processes. 
- blocked = subprocess.Popen( - ["ray", "start", "--head", "--block", "--port", "0"]) - blocked.poll() - assert blocked.returncode is None - - wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) - - blocked.terminate() - wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) - blocked.wait() - assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" - - -@pytest.mark.parametrize( - "call_ray_start", - ["ray start --head --num-cpus=1 " + "--node-ip-address=localhost"], - indirect=True) -def test_using_hostnames(call_ray_start): - ray.init(_node_ip_address="localhost", address="localhost:6379") - - @ray.remote - def f(): - return 1 - - assert ray.get(f.remote()) == 1 - - -def test_connecting_in_local_case(ray_start_regular): - address_info = ray_start_regular - - # Define a driver that just connects to Redis. - driver_script = """ -import ray -ray.init(address="{}") -print("success") -""".format(address_info["redis_address"]) - - out = run_string_as_driver(driver_script) - # Make sure the other driver succeeded. - assert "success" in out - - -def test_run_driver_twice(ray_start_regular): - # We used to have issue 2165 and 2288: - # https://github.com/ray-project/ray/issues/2165 - # https://github.com/ray-project/ray/issues/2288 - # both complain that driver will hang when run for the second time. - # This test is used to verify the fix for above issue, it will run the - # same driver for twice and verify whether both of them succeed. 
- address_info = ray_start_regular - driver_script = """ -import ray -import ray.tune as tune -import os -import time - -def train_func(config, reporter): # add a reporter arg - for i in range(2): - time.sleep(0.1) - reporter(timesteps_total=i, mean_accuracy=i+97) # report metrics - -os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" -ray.init(address="{}") -ray.tune.register_trainable("train_func", train_func) - -tune.run_experiments({{ - "my_experiment": {{ - "run": "train_func", - "stop": {{"mean_accuracy": 99}}, - "config": {{ - "layer1": {{ - "class_name": tune.grid_search(["a"]), - "config": {{"lr": tune.grid_search([1, 2])}} - }}, - }}, - "local_dir": os.path.expanduser("~/tmp") - }} -}}) -print("success") -""".format(address_info["redis_address"]) - - for i in range(2): - out = run_string_as_driver(driver_script) - assert "success" in out - - -@pytest.mark.skip(reason="fate sharing not implemented yet") -def test_driver_exiting_when_worker_blocked(call_ray_start): - # This test will create some drivers that submit some tasks and then - # exit without waiting for the tasks to complete. - address = call_ray_start - - ray.init(address=address) - - # Define a driver that creates two tasks, one that runs forever and the - # other blocked on the first in a `ray.get`. - driver_script = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def f(): - time.sleep(10**6) -@ray.remote -def g(): - ray.get(f.remote()) -g.remote() -time.sleep(1) -print("success") -""".format(address) - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - out = run_string_as_driver(driver_script) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that creates two tasks, one that runs forever and the - # other blocked on the first in a `ray.wait`. 
- driver_script = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def f(): - time.sleep(10**6) -@ray.remote -def g(): - ray.wait([f.remote()]) -g.remote() -time.sleep(1) -print("success") -""".format(address) - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - out = run_string_as_driver(driver_script) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that creates one task that depends on a nonexistent - # object. This task will be queued as waiting to execute. - driver_script_template = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def g(x): - return -g.remote(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) -time.sleep(1) -print("success") -""" - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - nonexistent_id = ray.ObjectRef.from_random() - driver_script = driver_script_template.format(address, - nonexistent_id.hex()) - out = run_string_as_driver(driver_script) - # Simulate the nonexistent dependency becoming available. - ray.worker.global_worker.put_object(None, nonexistent_id) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that calls `ray.wait` on a nonexistent object. - driver_script_template = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def g(): - ray.wait(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) -g.remote() -time.sleep(1) -print("success") -""" - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - nonexistent_id = ray.ObjectRef.from_random() - driver_script = driver_script_template.format(address, - nonexistent_id.hex()) - out = run_string_as_driver(driver_script) - # Simulate the nonexistent dependency becoming available. 
- ray.worker.global_worker.put_object(None, nonexistent_id) - # Make sure the first driver ran to completion. - assert "success" in out - - @ray.remote - def f(): - return 1 - - # Make sure we can still talk with the raylet. - ray.get(f.remote()) - - -def test_multi_driver_logging(ray_start_regular): - address_info = ray_start_regular - address = address_info["redis_address"] - - # ray.init(address=address) - driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0) - driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0) - main_wait = Semaphore.options(name="main_wait").remote(value=0) - - # The creation of an actor is asynchronous. - # We need to wait for the completion of the actor creation, - # otherwise we can't get the actor by name. - ray.get(driver1_wait.locked.remote()) - ray.get(driver2_wait.locked.remote()) - ray.get(main_wait.locked.remote()) - - # Params are address, semaphore name, output1, output2 - driver_script_template = """ -import ray -import sys -from ray.test_utils import Semaphore - -@ray.remote(num_cpus=0) -def remote_print(s, file=None): - print(s, file=file) - -ray.init(address="{}") - -driver_wait = ray.get_actor("{}") -main_wait = ray.get_actor("main_wait") - -ray.get(main_wait.release.remote()) -ray.get(driver_wait.acquire.remote()) - -s1 = "{}" -ray.get(remote_print.remote(s1)) - -ray.get(main_wait.release.remote()) -ray.get(driver_wait.acquire.remote()) - -s2 = "{}" -ray.get(remote_print.remote(s2)) - -ray.get(main_wait.release.remote()) - """ - - p1 = run_string_as_driver_nonblocking( - driver_script_template.format(address, "driver1_wait", "1", "2")) - p2 = run_string_as_driver_nonblocking( - driver_script_template.format(address, "driver2_wait", "3", "4")) - - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - # At this point both of the other drivers are fully initialized. 
- - ray.get(driver1_wait.release.remote()) - ray.get(driver2_wait.release.remote()) - - # At this point driver1 should receive '1' and driver2 '3' - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - - ray.get(driver1_wait.release.remote()) - ray.get(driver2_wait.release.remote()) - - # At this point driver1 should receive '2' and driver2 '4' - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - - driver1_out = p1.stdout.read().decode("ascii") - driver2_out = p2.stdout.read().decode("ascii") - if sys.platform == "win32": - driver1_out = driver1_out.replace("\r", "") - driver2_out = driver2_out.replace("\r", "") - driver1_out_split = driver1_out.split("\n") - driver2_out_split = driver2_out.split("\n") - - assert driver1_out_split[0][-1] == "1", driver1_out_split - assert driver1_out_split[1][-1] == "2", driver1_out_split - assert driver2_out_split[0][-1] == "3", driver2_out_split - assert driver2_out_split[1][-1] == "4", driver2_out_split - - -if __name__ == "__main__": - import pytest - # Make subprocess happy in bazel. - os.environ["LC_ALL"] = "en_US.UTF-8" - os.environ["LANG"] = "en_US.UTF-8" - sys.exit(pytest.main(["-v", __file__])) From b5ff0bb80fcad9cc37685e52bfe2ad7264af3ff0 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 191/244] Revert "[ray_client]: Monitor client stream errors (#13386)" This reverts commit a53b28b8c4cb328eadb4021ab0651a3fed306b89. 
--- python/ray/tests/test_client.py | 27 --------------------------- python/ray/util/client/__init__.py | 4 +--- python/ray/util/client/dataclient.py | 19 ++----------------- python/ray/util/client/logsclient.py | 14 ++------------ python/ray/util/client/worker.py | 10 ---------- 5 files changed, 5 insertions(+), 69 deletions(-) diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index 30d6faccbad9..dc5de2470e6e 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -364,32 +364,5 @@ def run_client(): ray_client._inside_client_test = False -def test_dataclient_server_drop(ray_start_regular_shared): - from ray.util.client import ray as ray_client - ray_client._inside_client_test = True - - @ray_client.remote - def f(x): - time.sleep(4) - return x - - def stop_server(server): - time.sleep(2) - server.stop(0) - - server = ray_client_server.serve("localhost:50051") - ray_client.connect("localhost:50051") - thread = threading.Thread(target=stop_server, args=(server, )) - thread.start() - x = f.remote(2) - with pytest.raises(ConnectionError): - _ = ray_client.get(x) - thread.join() - ray_client.disconnect() - ray_client._inside_client_test = False - # Wait for f(x) to finish before ray.shutdown() in the fixture - time.sleep(3) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/client/__init__.py b/python/ray/util/client/__init__.py index 1c28dc53c64a..02aab93ff5ae 100644 --- a/python/ray/util/client/__init__.py +++ b/python/ray/util/client/__init__.py @@ -89,9 +89,7 @@ def __getattr__(self, key: str): return getattr(self.api, key) def is_connected(self) -> bool: - if self.client_worker is None: - return False - return self.client_worker.is_connected() + return self.client_worker is not None def init(self, *args, **kwargs): if self._server is not None: diff --git a/python/ray/util/client/dataclient.py b/python/ray/util/client/dataclient.py index 
a0750b790bb6..6e29ea927b83 100644 --- a/python/ray/util/client/dataclient.py +++ b/python/ray/util/client/dataclient.py @@ -37,7 +37,6 @@ def __init__(self, channel: "grpc._channel.Channel", client_id: str, self._req_id = 0 self._client_id = client_id self._metadata = metadata - self._in_shutdown = False self.data_thread.start() def _next_id(self) -> int: @@ -68,19 +67,9 @@ def _data_main(self) -> None: self.ready_data[response.req_id] = response self.cv.notify_all() except grpc.RpcError as e: - with self.cv: - self._in_shutdown = True - self.cv.notify_all() - if e.code() == grpc.StatusCode.CANCELLED: + if grpc.StatusCode.CANCELLED == e.code(): # Gracefully shutting down logger.info("Cancelling data channel") - elif e.code() == grpc.StatusCode.UNAVAILABLE: - # TODO(barakmich): The server may have - # dropped. In theory, we can retry, as per - # https://grpc.github.io/grpc/core/md_doc_statuscodes.html but - # in practice we may need to think about the correct semantics - # here. - logger.info("Server disconnected from data channel") else: logger.error( f"Got Error from data channel -- shutting down: {e}") @@ -99,11 +88,7 @@ def _blocking_send(self, req: ray_client_pb2.DataRequest self.request_queue.put(req) data = None with self.cv: - self.cv.wait_for( - lambda: req_id in self.ready_data or self._in_shutdown) - if self._in_shutdown: - raise ConnectionError( - f"cannot send request {req}: data channel shutting down") + self.cv.wait_for(lambda: req_id in self.ready_data) data = self.ready_data[req_id] del self.ready_data[req_id] return data diff --git a/python/ray/util/client/logsclient.py b/python/ray/util/client/logsclient.py index f7902024d256..0e4d02846a37 100644 --- a/python/ray/util/client/logsclient.py +++ b/python/ray/util/client/logsclient.py @@ -44,18 +44,8 @@ def _log_main(self) -> None: self.stdstream(level=record.level, msg=record.msg) self.log(level=record.level, msg=record.msg) except grpc.RpcError as e: - if e.code() == grpc.StatusCode.CANCELLED: - # 
Graceful shutdown. We've cancelled our own connection. - logger.info("Cancelling logs channel") - elif e.code() == grpc.StatusCode.UNAVAILABLE: - # TODO(barakmich): The server may have - # dropped. In theory, we can retry, as per - # https://grpc.github.io/grpc/core/md_doc_statuscodes.html but - # in practice we may need to think about the correct semantics - # here. - logger.info("Server disconnected from logs channel") - else: - # Some other, unhandled, gRPC error + if grpc.StatusCode.CANCELLED != e.code(): + # Not just shutting down normally logger.error( f"Got Error from logger channel -- shutting down: {e}") raise e diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index 9f2f189c6ae2..d62173be745f 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -60,7 +60,6 @@ def __init__(self, """ self.metadata = metadata if metadata else [] self.channel = None - self._conn_state = grpc.ChannelConnectivity.IDLE self._client_id = make_client_id() if secure: credentials = grpc.ssl_channel_credentials() @@ -68,8 +67,6 @@ def __init__(self, else: self.channel = grpc.insecure_channel(conn_str) - self.channel.subscribe(self._on_channel_state_change) - # Retry the connection until the channel responds to something # looking like a gRPC connection, though it may be a proxy. 
conn_attempts = 0 @@ -131,10 +128,6 @@ def __init__(self, self.log_client.set_logstream_level(logging.INFO) self.closed = False - def _on_channel_state_change(self, conn_state: grpc.ChannelConnectivity): - logger.debug(f"client gRPC channel state change: {conn_state}") - self._conn_state = conn_state - def connection_info(self): try: data = self.data_client.ConnectionInfo() @@ -364,9 +357,6 @@ def is_initialized(self) -> bool: ray_client_pb2.ClusterInfoType.IS_INITIALIZED) return False - def is_connected(self) -> bool: - return self._conn_state == grpc.ChannelConnectivity.READY - def make_client_id() -> str: id = uuid.uuid4() From 567991189608d90530bcacf82e2d3575d363cde9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 192/244] Revert "[CLI] Fix Ray Status with ENV Variable set (#13707)" This reverts commit 3ffefe222357217bf0bfafe2a93336b494a28f10. --- python/ray/_private/services.py | 2 +- python/ray/tests/test_cli.py | 19 ------------------- .../test_cli_patterns/test_ray_status.txt | 12 ------------ 3 files changed, 1 insertion(+), 32 deletions(-) delete mode 100644 python/ray/tests/test_cli_patterns/test_ray_status.txt diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 435c16d4eebc..c9ea996f9c0c 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -216,7 +216,7 @@ def get_ray_address_to_use_or_die(): A string to pass into `ray.init(address=...)` """ if "RAY_ADDRESS" in os.environ: - return os.environ.get("RAY_ADDRESS") + return "auto" # Avoid conflict with RAY_ADDRESS env var return find_redis_address_or_die() diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index a6f1b1989ae9..57bf61419690 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -415,24 +415,5 @@ def commands_mock(command, stdin): _check_output_via_pattern("test_ray_submit.txt", result) -def 
test_ray_status(): - import ray - address = ray.init().get("redis_address") - runner = CliRunner() - result = runner.invoke(scripts.status, []) - _check_output_via_pattern("test_ray_status.txt", result) - - result_arg = runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_arg) - - # Try to check status with RAY_ADDRESS set - os.environ["RAY_ADDRESS"] = address - result_env = runner.invoke(scripts.status) - _check_output_via_pattern("test_ray_status.txt", result_env) - - result_env_arg = runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_env_arg) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt b/python/ray/tests/test_cli_patterns/test_ray_status.txt deleted file mode 100644 index 7169c5f0f096..000000000000 --- a/python/ray/tests/test_cli_patterns/test_ray_status.txt +++ /dev/null @@ -1,12 +0,0 @@ -======== Cluster status: .+ -Node status ------------------------------------------------------------- - - -Resources ------------------------------------------------------------- -Usage: - - -Demands: - \(no resource demands\) From b22f4cc3eb9edd4264e49bf3cc20c8ad3bdd9a03 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 193/244] Revert "Rename the ray.operator module to ray.ray_operator (#13705)" This reverts commit 9b7944c885bfa0cec230f2ef8a5cb7abb126533c. 
--- ci/travis/format.sh | 4 ++-- python/ray/{ray_operator => operator}/__init__.py | 0 python/ray/{ray_operator => operator}/operator.py | 2 +- python/ray/{ray_operator => operator}/operator_utils.py | 0 python/ray/setup-dev.py | 2 +- python/setup.py | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename python/ray/{ray_operator => operator}/__init__.py (100%) rename python/ray/{ray_operator => operator}/operator.py (99%) rename python/ray/{ray_operator => operator}/operator_utils.py (100%) diff --git a/ci/travis/format.sh b/ci/travis/format.sh index bb916869cca2..3f4b753f4d12 100755 --- a/ci/travis/format.sh +++ b/ci/travis/format.sh @@ -107,8 +107,8 @@ MYPY_FILES=( 'autoscaler/node_provider.py' 'autoscaler/sdk.py' 'autoscaler/_private/commands.py' - 'ray_operator/operator.py' - 'ray_operator/operator_utils.py' + 'operator/operator.py' + 'operator/operator_utils.py' ) YAPF_EXCLUDES=( diff --git a/python/ray/ray_operator/__init__.py b/python/ray/operator/__init__.py similarity index 100% rename from python/ray/ray_operator/__init__.py rename to python/ray/operator/__init__.py diff --git a/python/ray/ray_operator/operator.py b/python/ray/operator/operator.py similarity index 99% rename from python/ray/ray_operator/operator.py rename to python/ray/operator/operator.py index cc03c2fefc8f..cf83eaa240d5 100644 --- a/python/ray/ray_operator/operator.py +++ b/python/ray/operator/operator.py @@ -9,7 +9,7 @@ from ray._private import services from ray.autoscaler._private import commands from ray import monitor -from ray.ray_operator import operator_utils +from ray.operator import operator_utils from ray import ray_constants diff --git a/python/ray/ray_operator/operator_utils.py b/python/ray/operator/operator_utils.py similarity index 100% rename from python/ray/ray_operator/operator_utils.py rename to python/ray/operator/operator_utils.py diff --git a/python/ray/setup-dev.py b/python/ray/setup-dev.py index dcbb622ad16d..285c0028e159 100755 --- 
a/python/ray/setup-dev.py +++ b/python/ray/setup-dev.py @@ -66,7 +66,7 @@ def do_link(package, force=False, local_path=None): do_link("rllib", force=args.yes, local_path="../../../rllib") do_link("tune", force=args.yes) do_link("autoscaler", force=args.yes) - do_link("ray_operator", force=args.yes) + do_link("operator", force=args.yes) do_link("cloudpickle", force=args.yes) do_link("scripts", force=args.yes) do_link("internal", force=args.yes) diff --git a/python/setup.py b/python/setup.py index e00fcc0820bb..a1542a7a292c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -449,7 +449,7 @@ def has_ext_modules(self): "ray=ray.scripts.scripts:main", "rllib=ray.rllib.scripts:cli [rllib]", "tune=ray.tune.scripts:cli", - "ray-operator=ray.ray_operator.operator:main", + "ray-operator=ray.operator.operator:main", "serve=ray.serve.scripts:cli", ] }, From 9eecd41a97d1659ffef887436d5675275076e14f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 194/244] Revert "[CI] skip failing java tests (#13702)" This reverts commit a0bf67c142e10cb36a9bd8009d1def38a6e512b1. 
--- java/test/src/main/java/io/ray/test/ActorRestartTest.java | 4 +--- java/test/src/main/java/io/ray/test/ExitActorTest.java | 4 +--- java/test/src/main/java/io/ray/test/MultiDriverTest.java | 4 +--- java/test/src/main/java/io/ray/test/PlacementGroupTest.java | 4 +--- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/java/test/src/main/java/io/ray/test/ActorRestartTest.java b/java/test/src/main/java/io/ray/test/ActorRestartTest.java index 26326073c634..fe70e086764d 100644 --- a/java/test/src/main/java/io/ray/test/ActorRestartTest.java +++ b/java/test/src/main/java/io/ray/test/ActorRestartTest.java @@ -9,9 +9,7 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test( - groups = {"cluster"}, - enabled = false) +@Test(groups = {"cluster"}) public class ActorRestartTest extends BaseTest { public static class Counter { diff --git a/java/test/src/main/java/io/ray/test/ExitActorTest.java b/java/test/src/main/java/io/ray/test/ExitActorTest.java index a1c40e2ac8a1..279af55c05e5 100644 --- a/java/test/src/main/java/io/ray/test/ExitActorTest.java +++ b/java/test/src/main/java/io/ray/test/ExitActorTest.java @@ -15,9 +15,7 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test( - groups = {"cluster"}, - enabled = false) +@Test(groups = {"cluster"}) public class ExitActorTest extends BaseTest { private static class ExitingActor { diff --git a/java/test/src/main/java/io/ray/test/MultiDriverTest.java b/java/test/src/main/java/io/ray/test/MultiDriverTest.java index 3feb981927c0..9c781f56283f 100644 --- a/java/test/src/main/java/io/ray/test/MultiDriverTest.java +++ b/java/test/src/main/java/io/ray/test/MultiDriverTest.java @@ -17,9 +17,7 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test( - groups = {"cluster"}, - enabled = false) +@Test(groups = {"cluster"}) public class MultiDriverTest extends BaseTest { private static final int DRIVER_COUNT = 10; diff --git 
a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java index 89d1fab69452..edbd2c30e4d6 100644 --- a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java +++ b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java @@ -83,9 +83,7 @@ public void testGetPlacementGroup() { Assert.assertEquals(placementGroupRes.getStrategy(), expectPlacementGroup.getStrategy()); } - @Test( - groups = {"cluster"}, - enabled = false) + @Test(groups = {"cluster"}) public void testRemovePlacementGroup() { PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); From 4977b55690b8417537f6e71b5f313ee0c1e00948 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 195/244] Revert "[Serve] fix k8s doc (#13713)" This reverts commit b2dc7690c41be5b1052e75e1aeefda90cccd7f16. --- doc/source/serve/deployment.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/deployment.rst b/doc/source/serve/deployment.rst index 1ab190595796..5ab65a7a35c1 100644 --- a/doc/source/serve/deployment.rst +++ b/doc/source/serve/deployment.rst @@ -225,7 +225,7 @@ With the cluster now running, we can run a simple script to start Ray Serve and # Connect to the running Ray cluster. ray.init(address="auto") # Bind on 0.0.0.0 to expose the HTTP server on external IPs. - client = serve.start(detached=True, http_options={"host": "0.0.0.0"}) + client = serve.start(http_options={"host": "0.0.0.0"}) def hello(): return "hello world" From 2eb037ee9068ba1defbaf18f7089f0e1ea975d1d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 196/244] Revert "[tune](deps): Bump autogluon-core in /python/requirements (#13698)" This reverts commit ce4d601f2e68ceb1d8e75b1db1ba5c5e026ed912. 
--- python/requirements/linux-py3.6-requirements_tune.txt | 2 +- python/requirements/linux-py3.7-requirements_tune.txt | 2 +- python/requirements/linux-py3.8-requirements_tune.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index 1bafdac84b67..eb72499c1ed9 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210125 +autogluon.core==0.0.16b20210122 # via gluoncv autograd==1.3 # via autogluon.core diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index 920222b459ef..99e7fe1a9b53 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210125 +autogluon.core==0.0.16b20210122 # via gluoncv autograd==1.3 # via autogluon.core diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt index 14aade6549ee..8ef61bd51b63 100644 --- a/python/requirements/linux-py3.8-requirements_tune.txt +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210125 +autogluon.core==0.0.16b20210122 # via gluoncv autograd==1.3 # via autogluon.core From 61733d63bcf7da495e2d8f14952053ea022b5139 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 197/244] Revert "[tune](deps): Bump smart-open[s3] in /python/requirements (#13699)" This reverts commit f4642332de1848c414f08d883dcd6fcdae769132. 
--- python/requirements/linux-py3.6-requirements_tune.txt | 2 +- python/requirements/linux-py3.7-requirements_tune.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index eb72499c1ed9..bae7f20ae363 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -735,7 +735,7 @@ six==1.15.0 # traitlets # wandb # websocket-client -smart_open[s3]==4.1.2 +smart_open[s3]==4.0.1 # via # -c ../requirements.txt # -r requirements_tune.in diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index 99e7fe1a9b53..bb10df777068 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -725,7 +725,7 @@ six==1.15.0 # tensorflow-probability # wandb # websocket-client -smart_open[s3]==4.1.2 +smart_open[s3]==4.0.1 # via # -c ../requirements.txt # -r requirements_tune.in From 437321ffad0fe02bf91e78148a81e40999db575d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 198/244] Revert "[Collective][PR 4/6] NCCL Communicator caching and preliminary stream management (#13030)" This reverts commit ed6b2121e0031d34354d06cf7a59aceca04c0ded. 
--- python/ray/util/collective/__init__.py | 18 +- python/ray/util/collective/collective.py | 327 ++-------- .../collective_group/nccl_collective_group.py | 609 ++++++------------ .../collective/collective_group/nccl_util.py | 50 +- .../examples/nccl_allreduce_example.py | 7 +- ...reduce_example_declare_collective_group.py | 1 + .../nccl_allreduce_multigpu_example.py | 43 -- .../examples/nccl_p2p_example_multigpu.py | 53 -- python/ray/util/collective/tests/conftest.py | 39 +- .../distributed_multigpu_tests/__init__.py | 0 .../test_distributed_multigpu_allgather.py | 82 --- .../test_distributed_multigpu_allreduce.py | 160 ----- .../test_distributed_multigpu_basic_apis.py | 117 ---- .../test_distributed_multigpu_broadcast.py | 92 --- .../test_distributed_multigpu_reduce.py | 173 ----- ...test_distributed_multigpu_reducescatter.py | 82 --- .../test_distributed_multigpu_sendrecv.py | 47 -- .../test_distributed_basic_apis.py | 6 +- .../test_distributed_broadcast.py | 3 +- .../tests/sinlge_node_tests/__init__.py | 0 .../{sinlge_node_tests => }/test_allgather.py | 0 .../{sinlge_node_tests => }/test_allreduce.py | 0 .../test_basic_apis.py | 6 +- .../{sinlge_node_tests => }/test_broadcast.py | 0 .../{sinlge_node_tests => }/test_reduce.py | 0 .../test_reducescatter.py | 0 .../{sinlge_node_tests => }/test_sendrecv.py | 0 python/ray/util/collective/tests/util.py | 272 +------- python/ray/util/collective/types.py | 19 - 29 files changed, 276 insertions(+), 1930 deletions(-) delete mode 100644 python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py delete mode 100644 python/ray/util/collective/examples/nccl_p2p_example_multigpu.py delete mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py delete mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py delete mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py delete mode 
100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py delete mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py delete mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py delete mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py delete mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py delete mode 100644 python/ray/util/collective/tests/sinlge_node_tests/__init__.py rename python/ray/util/collective/tests/{sinlge_node_tests => }/test_allgather.py (100%) rename python/ray/util/collective/tests/{sinlge_node_tests => }/test_allreduce.py (100%) rename python/ray/util/collective/tests/{sinlge_node_tests => }/test_basic_apis.py (97%) rename python/ray/util/collective/tests/{sinlge_node_tests => }/test_broadcast.py (100%) rename python/ray/util/collective/tests/{sinlge_node_tests => }/test_reduce.py (100%) rename python/ray/util/collective/tests/{sinlge_node_tests => }/test_reducescatter.py (100%) rename python/ray/util/collective/tests/{sinlge_node_tests => }/test_sendrecv.py (100%) diff --git a/python/ray/util/collective/__init__.py b/python/ray/util/collective/__init__.py index 694698474062..4ae88660702f 100644 --- a/python/ray/util/collective/__init__.py +++ b/python/ray/util/collective/__init__.py @@ -1,15 +1,11 @@ -from ray.util.collective.collective import nccl_available, gloo_available, \ +from ray.util.collective.collective import nccl_available, mpi_available, \ is_group_initialized, init_collective_group, destroy_collective_group, \ - declare_collective_group, get_rank, get_world_size, allreduce, \ - allreduce_multigpu, barrier, reduce, reduce_multigpu, broadcast, \ - broadcast_multigpu, allgather, allgather_multigpu, reducescatter, \ - reducescatter_multigpu, send, 
send_multigpu, recv, recv_multigpu + get_rank, get_world_size, allreduce, barrier, reduce, broadcast, \ + allgather, reducescatter, send, recv __all__ = [ - "nccl_available", "gloo_available", "is_group_initialized", - "init_collective_group", "destroy_collective_group", - "declare_collective_group", "get_rank", "get_world_size", "allreduce", - "allreduce_multigpu", "barrier", "reduce", "reduce_multigpu", "broadcast", - "broadcast_multigpu", "allgather", "allgather_multigpu", "reducescatter", - "reducescatter_multigpu", "send", "send_multigpu", "recv", "recv_multigpu" + "nccl_available", "mpi_available", "is_group_initialized", + "init_collective_group", "destroy_collective_group", "get_rank", + "get_world_size", "allreduce", "barrier", "reduce", "broadcast", + "allgather", "reducescatter", "send", "recv" ] diff --git a/python/ray/util/collective/collective.py b/python/ray/util/collective/collective.py index afd523e6bf37..08f9026b0467 100644 --- a/python/ray/util/collective/collective.py +++ b/python/ray/util/collective/collective.py @@ -7,9 +7,14 @@ import ray from ray.util.collective import types -_GLOO_AVAILABLE = False +_MPI_AVAILABLE = False _NCCL_AVAILABLE = True +# try: +# from ray.util.collective.collective_group.mpi_collective_group \ +# import MPIGroup +# except ImportError: +# _MPI_AVAILABLE = False try: from ray.util.collective.collective_group import NCCLGroup except ImportError: @@ -22,8 +27,8 @@ def nccl_available(): return _NCCL_AVAILABLE -def gloo_available(): - return _GLOO_AVAILABLE +def mpi_available(): + return _MPI_AVAILABLE class GroupManager(object): @@ -46,11 +51,9 @@ def create_collective_group(self, backend, world_size, rank, group_name): """ backend = types.Backend(backend) if backend == types.Backend.MPI: - raise RuntimeError("Ray does not support MPI.") - elif backend == types.Backend.GLOO: raise NotImplementedError() elif backend == types.Backend.NCCL: - logger.debug("Creating NCCL group: '{}'...".format(group_name)) + 
logger.debug("creating NCCL group: '{}'".format(group_name)) g = NCCLGroup(world_size, rank, group_name) self._name_group_map[group_name] = g self._group_name_map[g] = group_name @@ -97,9 +100,9 @@ def init_collective_group(world_size: int, """Initialize a collective group inside an actor process. Args: - world_size (int): the total number of processes in the group. + world_size (int): the total number of processed in the group. rank (int): the rank of the current process. - backend: the CCL backend to use, NCCL or GLOO. + backend: the CCL backend to use, NCCL or MPI. group_name (str): the name of the collective group. Returns: @@ -134,13 +137,10 @@ def declare_collective_group(actors, Args: actors (list): a list of actors to be set in a collective group. - world_size (int): the total number of processes in the group. - ranks (List[int]): the rank of each actor. - backend: the CCL backend to use, NCCL or GLOO. - group_name (str): the name of the collective group. - - Returns: - None + group_options (dict): a dictionary that contains group_name(str), + world_size(int), rank(list of int, e.g. [0,1] + means the first actor is rank 0, and the second + actor is rank 1), backend(str). """ backend = types.Backend(backend) _check_backend_availability(backend) @@ -162,25 +162,18 @@ def declare_collective_group(actors, "Ranks must be a permutation from 0 to '{}'. Got '{}'.".format( len(ranks), "".join([str(r) for r in ranks]))) - if world_size <= 0: - raise RuntimeError("World size must be greater than zero. " - "Got '{}'.".format(world_size)) - if not all(ranks) >= 0: - raise RuntimeError("Ranks must be non-negative.") - if not all(ranks) < world_size: - raise RuntimeError("Ranks cannot be greater than world_size.") + assert world_size > 0 + assert all(ranks) >= 0 and all(ranks) < world_size # avoid a circular dependency from ray.util.collective.util import Info - # store the information into a NamedActor that can be accessed later. 
+ # store the information into a NamedActor that can be accessed later/ name = "info_" + group_name actors_id = [a._ray_actor_id for a in actors] - # TODO (Dacheng): how do we recycle this name actor? info = Info.options(name=name, lifetime="detached").remote() ray.get([info.set_info.remote(actors_id, world_size, ranks, backend)]) -# TODO (we need a declarative destroy() API here.) def destroy_collective_group(group_name: str = "default") -> None: """Destroy a collective group given its group name.""" _check_inside_actor() @@ -213,8 +206,9 @@ def get_world_size(group_name: str = "default") -> int: group_name: the name of the group to query Returns: - The world size of the collective group, -1 if the group does - not exist or the process does not belong to the group. + The world size of the collective group, + -1 if the group does not exist or the process does + not belong to the group. """ _check_inside_actor() if not is_group_initialized(group_name): @@ -238,29 +232,7 @@ def allreduce(tensor, group_name: str = "default", op=types.ReduceOp.SUM): g = _check_and_get_group(group_name) opts = types.AllReduceOptions opts.reduceOp = op - g.allreduce([tensor], opts) - - -def allreduce_multigpu(tensor_list: list, - group_name: str = "default", - op=types.ReduceOp.SUM): - """Collective allreduce a list of tensors across the group. - - Args: - tensor_list (List[tensor]): list of tensors to be allreduced, - each on a GPU. - group_name (str): the collective group name to perform allreduce. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_list_input(tensor_list) - g = _check_and_get_group(group_name) - opts = types.AllReduceOptions - opts.reduceOp = op - g.allreduce(tensor_list, opts) + g.allreduce(tensor, opts) def barrier(group_name: str = "default"): @@ -284,8 +256,8 @@ def reduce(tensor, Args: tensor: the tensor to be reduced on this process. 
- dst_rank (int): the rank of the destination process. - group_name (str): the collective group name to perform reduce. + dst_rank: the rank of the destination process. + group_name: the collective group name to perform reduce. op: The reduce operation. Returns: @@ -299,42 +271,7 @@ def reduce(tensor, opts = types.ReduceOptions() opts.reduceOp = op opts.root_rank = dst_rank - opts.root_tensor = 0 - g.reduce([tensor], opts) - - -def reduce_multigpu(tensor_list: list, - dst_rank: int = 0, - dst_tensor: int = 0, - group_name: str = "default", - op=types.ReduceOp.SUM): - """Reduce the tensor across the group to the destination rank - and destination tensor. - - Args: - tensor_list: the list of tensors to be reduced on this process; - each tensor located on a GPU. - dst_rank (int): the rank of the destination process. - dst_tensor: the index of GPU at the destination. - group_name (str): the collective group name to perform reduce. - op: The reduce operation. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_list_input(tensor_list) - g = _check_and_get_group(group_name) - - # check dst rank - _check_rank_valid(g, dst_rank) - _check_root_tensor_valid(len(tensor_list), dst_tensor) - opts = types.ReduceOptions() - opts.reduceOp = op - opts.root_rank = dst_rank - opts.root_tensor = dst_tensor - g.reduce(tensor_list, opts) + g.reduce(tensor, opts) def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): @@ -342,8 +279,8 @@ def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): Args: tensor: the tensor to be broadcasted (src) or received (destination). - src_rank (int): the rank of the source process. - group_name (str): the collective group name to perform broadcast. + src_rank: the rank of the source process. + group_name: he collective group name to perform broadcast. 
Returns: None @@ -355,37 +292,7 @@ def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): _check_rank_valid(g, src_rank) opts = types.BroadcastOptions() opts.root_rank = src_rank - opts.root_tensor = 0 - g.broadcast([tensor], opts) - - -def broadcast_multigpu(tensor_list, - src_rank: int = 0, - src_tensor: int = 0, - group_name: str = "default"): - """Broadcast the tensor from a source GPU to all other GPUs. - - Args: - tensor_list: the tensors to broadcast (src) or receive (dst). - src_rank (int): the rank of the source process. - src_tensor (int): the index of the source GPU on the source process. - group_name (str): the collective group name to perform broadcast. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_list_input(tensor_list) - g = _check_and_get_group(group_name) - - # check src rank - _check_rank_valid(g, src_rank) - _check_root_tensor_valid(len(tensor_list), src_tensor) - opts = types.BroadcastOptions() - opts.root_rank = src_rank - opts.root_tensor = src_tensor - g.broadcast(tensor_list, opts) + g.broadcast(tensor, opts) def allgather(tensor_list: list, tensor, group_name: str = "default"): @@ -394,7 +301,7 @@ def allgather(tensor_list: list, tensor, group_name: str = "default"): Args: tensor_list (list): the results, stored as a list of tensors. tensor: the tensor (to be gathered) in the current process - group_name (str): the name of the collective group. + group_name: the name of the collective group. Returns: None @@ -407,33 +314,9 @@ def allgather(tensor_list: list, tensor, group_name: str = "default"): # Here we make it more strict: len(tensor_list) == world_size. 
raise RuntimeError( "The length of the tensor list operands to allgather " - "must be equal to world_size.") - opts = types.AllGatherOptions() - g.allgather([tensor_list], [tensor], opts) - - -def allgather_multigpu(output_tensor_lists: list, - input_tensor_list: list, - group_name: str = "default"): - """Allgather tensors from each gpus of the group into lists. - - Args: - output_tensor_lists (List[List[tensor]]): gathered results, with shape - must be num_gpus * world_size * shape(tensor). - input_tensor_list: (List[tensor]): a list of tensors, with shape - num_gpus * shape(tensor). - group_name (str): the name of the collective group. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_lists_input(output_tensor_lists) - _check_tensor_list_input(input_tensor_list) - g = _check_and_get_group(group_name) + "must not be equal to world_size.") opts = types.AllGatherOptions() - g.allgather(output_tensor_lists, input_tensor_list, opts) + g.allgather(tensor_list, tensor, opts) def reducescatter(tensor, @@ -463,38 +346,11 @@ def reducescatter(tensor, "must not be equal to world_size.") opts = types.ReduceScatterOptions() opts.reduceOp = op - g.reducescatter([tensor], [tensor_list], opts) - - -def reducescatter_multigpu(output_tensor_list, - input_tensor_lists, - group_name: str = "default", - op=types.ReduceOp.SUM): - """Reducescatter a list of tensors across all GPUs. - - Args: - output_tensor_list: the resulted list of tensors, with - shape: num_gpus * shape(tensor). - input_tensor_lists: the original tensors, with shape: - num_gpus * world_size * shape(tensor). - group_name (str): the name of the collective group. - op: The reduce operation. - - Returns: - None. 
- """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_lists_input(input_tensor_lists) - _check_tensor_list_input(output_tensor_list) - g = _check_and_get_group(group_name) - opts = types.ReduceScatterOptions() - opts.reduceOp = op - g.reducescatter(output_tensor_list, input_tensor_lists, opts) + g.reducescatter(tensor, tensor_list, opts) def send(tensor, dst_rank: int, group_name: str = "default"): - """Send a tensor to a remote process synchronously. + """Send a tensor to a remote processes synchronously. Args: tensor: the tensor to send. @@ -510,41 +366,7 @@ def send(tensor, dst_rank: int, group_name: str = "default"): if dst_rank == g.rank: raise RuntimeError( "The destination rank '{}' is self.".format(dst_rank)) - opts = types.SendOptions() - opts.dst_rank = dst_rank - g.send([tensor], opts) - - -def send_multigpu(tensor, - dst_rank: int, - dst_gpu_index: int, - group_name: str = "default"): - """Send a tensor to a remote GPU synchronously. - - The function asssume each process owns >1 GPUs, and the sender - process and receiver process has equal nubmer of GPUs. - - Args: - tensor: the tensor to send, located on a GPU. - dst_rank (int): the rank of the destination process. - dst_gpu_index (int): the destination gpu index. - group_name (str): the name of the collective group. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("send_multigpu call requires NCCL.") - _check_single_tensor_input(tensor) - g = _check_and_get_group(group_name) - _check_rank_valid(g, dst_rank) - if dst_rank == g.rank: - raise RuntimeError("The dst_rank '{}' is self. 
Considering " - "doing GPU to GPU memcpy instead?".format(dst_rank)) - opts = types.SendOptions() - opts.dst_rank = dst_rank - opts.dst_gpu_index = dst_gpu_index - g.send([tensor], opts) + g.send(tensor, dst_rank) def recv(tensor, src_rank: int, group_name: str = "default"): @@ -564,41 +386,7 @@ def recv(tensor, src_rank: int, group_name: str = "default"): if src_rank == g.rank: raise RuntimeError( "The destination rank '{}' is self.".format(src_rank)) - opts = types.RecvOptions() - opts.src_rank = src_rank - g.recv([tensor], opts) - - -def recv_multigpu(tensor, - src_rank: int, - src_gpu_index: int, - group_name: str = "default"): - """Receive a tensor from a remote GPU synchronously. - - The function asssume each process owns >1 GPUs, and the sender - process and receiver process has equal nubmer of GPUs. - - Args: - tensor: the received tensor, located on a GPU. - src_rank (int): the rank of the source process. - src_gpu_index (int): the index of the source gpu on the src process. - group_name (str): the name of the collective group. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("recv_multigpu call requires NCCL.") - _check_single_tensor_input(tensor) - g = _check_and_get_group(group_name) - _check_rank_valid(g, src_rank) - if src_rank == g.rank: - raise RuntimeError("The dst_rank '{}' is self. 
Considering " - "doing GPU to GPU memcpy instead?".format(src_rank)) - opts = types.RecvOptions() - opts.src_rank = src_rank - opts.src_gpu_index = src_gpu_index - g.recv([tensor], opts) + g.recv(tensor, src_rank) def _check_and_get_group(group_name): @@ -635,6 +423,16 @@ def _check_and_get_group(group_name): return g +def _check_backend_availability(backend: types.Backend): + """Check whether the backend is available.""" + if backend == types.Backend.MPI: + if not mpi_available(): + raise RuntimeError("MPI is not available.") + elif backend == types.Backend.NCCL: + if not nccl_available(): + raise RuntimeError("NCCL is not available.") + + def _check_single_tensor_input(tensor): """Check if the tensor is with a supported type.""" if isinstance(tensor, np.ndarray): @@ -650,16 +448,6 @@ def _check_single_tensor_input(tensor): type(tensor))) -def _check_backend_availability(backend: types.Backend): - """Check whether the backend is available.""" - if backend == types.Backend.GLOO: - if not gloo_available(): - raise RuntimeError("GLOO is not available.") - elif backend == types.Backend.NCCL: - if not nccl_available(): - raise RuntimeError("NCCL is not available.") - - def _check_inside_actor(): """Check if currently it is inside a Ray actor/task.""" worker = ray.worker.global_worker @@ -674,8 +462,8 @@ def _check_rank_valid(g, rank: int): """Check the rank: 0 <= rank < world_size.""" if rank < 0: raise ValueError("rank '{}' is negative.".format(rank)) - if rank >= g.world_size: - raise ValueError("rank '{}' must be less than world size " + if rank > g.world_size: + raise ValueError("rank '{}' is greater than world size " "'{}'".format(rank, g.world_size)) @@ -688,24 +476,3 @@ def _check_tensor_list_input(tensor_list): raise RuntimeError("Got an empty list of tensors.") for t in tensor_list: _check_single_tensor_input(t) - - -def _check_tensor_lists_input(tensor_lists): - """Check if the input is a list of lists of supported tensor types.""" - if not 
isinstance(tensor_lists, list): - raise RuntimeError("The input must be a list of lists of tensors. " - "Got '{}'.".format(type(tensor_lists))) - if not tensor_lists: - raise RuntimeError(f"Did not receive tensors. Got: {tensor_lists}") - for t in tensor_lists: - _check_tensor_list_input(t) - - -def _check_root_tensor_valid(length, root_tensor): - """Check the root_tensor device is 0 <= root_tensor < length""" - if root_tensor < 0: - raise ValueError("root_tensor '{}' is negative.".format(root_tensor)) - if root_tensor >= length: - raise ValueError( - "root_tensor '{}' is greater than the number of GPUs: " - "'{}'".format(root_tensor, length)) diff --git a/python/ray/util/collective/collective_group/nccl_collective_group.py b/python/ray/util/collective/collective_group/nccl_collective_group.py index 4cc693f11479..ba8c7d2dbb08 100644 --- a/python/ray/util/collective/collective_group/nccl_collective_group.py +++ b/python/ray/util/collective/collective_group/nccl_collective_group.py @@ -11,11 +11,15 @@ from ray.util.collective.const import get_nccl_store_name from ray.util.collective.types import AllReduceOptions, \ BarrierOptions, Backend, ReduceOptions, BroadcastOptions, \ - AllGatherOptions, ReduceScatterOptions, SendOptions, \ - RecvOptions + AllGatherOptions, ReduceScatterOptions logger = logging.getLogger(__name__) +# TODO(Hao): +# (1) stream management, instead of using the default stream, +# using a dedicate stream +# (2) communicator management and support num_gpus > 2 per actor. + class Rendezvous: """A rendezvous class for different actor/task processes to meet. @@ -27,18 +31,13 @@ class Rendezvous: process. Args: - store_key (str): the unique store key, usually as a concatanation - of group_name and communicator key. See `get_nccl_communicator` - for more details. + group_name (str): the unique user-specified group name. """ - def __init__(self, store_key): - if not store_key: - raise ValueError( - "Invalid store_key. 
The store_key is a concatenation of " - "'group_name' and the 'communicator_key'. See the " - "docstring of `get_nccl_communicator` for details.") - self._store_key = store_key + def __init__(self, group_name): + if not group_name: + raise ValueError("Invalid group name.") + self._group_name = group_name self._store_name = None self._store = None @@ -54,7 +53,7 @@ def meet(self, timeout_s=180): if timeout_s <= 0: raise ValueError("The 'timeout' argument must be positive. " "Got '{}'.".format(timeout_s)) - self._store_name = get_nccl_store_name(self._store_key) + self._store_name = get_nccl_store_name(self._group_name) timeout_delta = datetime.timedelta(seconds=timeout_s) elapsed = datetime.timedelta(seconds=0) start_time = datetime.datetime.now() @@ -73,9 +72,7 @@ def meet(self, timeout_s=180): break if not self._store: raise RuntimeError("Unable to meet other processes " - "at the rendezvous store. If you are using " - "P2P communication, please check if tensors " - "are put in the correct GPU. ") + "at the rendezvous store.") @property def store(self): @@ -86,9 +83,8 @@ def get_nccl_id(self, timeout_s=180): Args: timeout_s: timeout in seconds. - Return: - uid (str): the NCCLUniqueID if successful. + str: the NCCLUniqueID if successful. """ if not self._store: raise ValueError("Rendezvous store is not setup.") @@ -114,52 +110,55 @@ def __init__(self, world_size, rank, group_name): """Init an NCCL collective group.""" super(NCCLGroup, self).__init__(world_size, rank, group_name) - # communicator and stream cache. - # TODO (Hao): we need a lock here... - self._dev_comm_map = {} - self._dev_streams_map = {} - - # record the used GPU IDs. 
- self._used_gpu_indices = set() + # TODO(Hao): change this to a be a cache + self._collective_comm_cache = None + self._p2p_comm_cache = {} if nccl_util.get_nccl_build_version() < 2000: raise RuntimeError("NCCL in Ray requires NCCL >= 2.0.") + # TODO(Hao): check version here if nccl_util.get_nccl_runtime_version() < 2704: logger.warning("NCCL send/recv calls requires NCCL>=2.7.4") + # Setup a tensor for barrier calls + self._barrier_tensor = cupy.array([1]) + def destroy_group(self): """Destroy the group and release NCCL communicators.""" - if len(self._dev_comm_map.keys()) > 0: - - # TODO(Hao): check this barrier call - # self.barrier() - - # Destroy the communicators and streams. - for comm_key, comms in self._dev_comm_map.items(): - for c in comms: - c.destroy() - self._dev_comm_map[comm_key] = None - - if self.rank == 0: - for comm_key in self._dev_comm_map: - assert not self._dev_comm_map[comm_key] - group_key = self._generate_group_key(comm_key) - self._destroy_store(group_key) - self._barrier_tensor = None - self._dev_comm_map = None - self._dev_streams_map = None + if self._collective_comm_cache: + self.barrier() + # We also need a barrier call here. + stream = self._get_cuda_stream() + stream.synchronize() + # destroy the communicator + self._collective_comm_cache.destroy() + self._collective_comm_cache = None + + if self.rank == 0: + self._destroy_store(self.group_name) + + if self._p2p_comm_cache: + for key, comm in self._p2p_comm_cache.items(): + comm.destroy() + min_rank, max_rank = self._parse_p2p_group_key(key) + if self.rank == min_rank: + self._destroy_store(key) + self._p2p_comm_cache[key] = None + for key in list(self._p2p_comm_cache.keys()): + del self._p2p_comm_cache[key] + self._p2p_comm_cache = None + super(NCCLGroup, self).destroy_group() @classmethod def backend(cls): return Backend.NCCL - def allreduce(self, tensors, allreduce_options=AllReduceOptions()): - """AllReduce tensors across the collective group following options. 
+ def allreduce(self, tensor, allreduce_options=AllReduceOptions()): + """AllReduce the tensor across the collective group following options. Args: - tensors (List): the list of tensors to be reduced. Each tensor must - reside on one GPU of the current process. + tensor: the tensor to be reduced, each tensor locates on a GPU. allreduce_options: allreduce options. Returns: @@ -175,41 +174,29 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp), stream.ptr) - self._collective(tensors, tensors, collective_fn) + self._collective(tensor, tensor, collective_fn) def barrier(self, barrier_options=BarrierOptions()): """Blocks until all processes reach this barrier. Args: - barrier_options: barrier options. + barrier_options: Returns: None """ - # Get the device list. - if self._used_gpu_indices: - devices = list(self._used_gpu_indices) - else: - devices = list(range(nccl_util.get_num_gpus())) - barrier_tensors = [None] * len(devices) - for i, d in enumerate(devices): - with nccl_util.Device(d): - barrier_tensors[i] = cupy.array([1]) - self.allreduce(barrier_tensors) - - def reduce(self, tensors, reduce_options=ReduceOptions()): - """Reduce tensors to a destination gpu following options. + self.allreduce(self._barrier_tensor) + + def reduce(self, tensor, reduce_options=ReduceOptions()): + """Reduce tensor to a destination process following options. Args: - tensors (List): the list of tensors to be reduced, each tensor - must reside on one gpu of the current process. - reduce_options: reduce options. + tensor: the tensor to be reduced. 
+ reduce_options: reduce options Returns: None """ - root_rank = len(tensors) * reduce_options.root_rank \ - + reduce_options.root_tensor def collective_fn(input_tensor, output_tensor, comm, stream): comm.reduce( @@ -218,43 +205,40 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_tensor_n_elements(input_tensor), nccl_util.get_nccl_tensor_dtype(input_tensor), nccl_util.get_nccl_reduce_op(reduce_options.reduceOp), - root_rank, stream.ptr) + reduce_options.root_rank, stream.ptr) - self._collective(tensors, tensors, collective_fn) + self._collective(tensor, tensor, collective_fn) - def broadcast(self, tensors, broadcast_options=BroadcastOptions()): - """Broadcast tensors to all other gpus following options. + def broadcast(self, tensor, broadcast_options=BroadcastOptions()): + """Broadcast tensor to all other processes following options. Args: - tensors (List): tensors to be broadcast or received. + tensor: the tensor to be broadcasted. broadcast_options: broadcast options. Returns: None """ - root_rank = len(tensors) * broadcast_options.root_rank \ - + broadcast_options.root_tensor def collective_fn(input_tensor, output_tensor, comm, stream): comm.broadcast( nccl_util.get_tensor_ptr(input_tensor), nccl_util.get_tensor_ptr(output_tensor), nccl_util.get_tensor_n_elements(input_tensor), - nccl_util.get_nccl_tensor_dtype(input_tensor), root_rank, - stream.ptr) + nccl_util.get_nccl_tensor_dtype(input_tensor), + broadcast_options.root_rank, stream.ptr) - self._collective(tensors, tensors, collective_fn) + self._collective(tensor, tensor, collective_fn) def allgather(self, - tensor_lists, - tensors, + tensor_list, + tensor, allgather_options=AllGatherOptions()): - """Allgather tensors across gpus into a list of tensors. + """Allgather tensors across the group into a list of tensors. Args: - tensor_lists (List[List[Tensor]]): allgathered tensors. - tensors: the list of tensors to allgather across the group. 
- Each tensor must lolcate on a GPU of the process. + tensor_list: the tensor list to store the results. + tensor: the tensor to be allgather-ed across the group. allgather_options: allgather options. Returns: @@ -268,36 +252,30 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_tensor_n_elements(input_tensor), nccl_util.get_nccl_tensor_dtype(input_tensor), stream.ptr) - _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists) - output_flattened = [ - _flatten_for_scatter_gather(tensor_list, copy=False) - for tensor_list in tensor_lists - ] + _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list) + flattened_output_tensor = _flatten_for_scatter_gather( + tensor_list, copy=False) def postprocess_fn(stream): - # TODO(Hao): designate a copy stream. - for i, tensor_list in enumerate(tensor_lists): - for j, tensor in enumerate(tensor_list): - nccl_util.copy_tensor(tensor, output_flattened[i][j]) + for i, tensor in enumerate(tensor_list): + nccl_util.copy_tensor(tensor, flattened_output_tensor[i]) self._collective( - tensors, - output_flattened, + tensor, + flattened_output_tensor, collective_fn, postprocess_fn=postprocess_fn) def reducescatter(self, - tensors, - tensor_lists, + tensor, + tensor_list, reducescatter_options=ReduceScatterOptions()): - """Reduce the scatter a list of tensors across the group. + """Reducescatter a list of tensors across the group. Args: - tensors (List): the output tensors (could be unspecified), each - located on a GPU of the current process. - tensor_lists (List[List]): the list of tensors to be reduced then - scattered. - reducescatter_options: reduce-scatter options. + tensor: the output tensor (could be unspecified). + tensor_list: the list of tensor to be reduced then scattered. + reducescatter_options: reducescatter options. 
Returns: None @@ -312,30 +290,26 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp), stream.ptr) - _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists) - input_flattened = [ - _flatten_for_scatter_gather(tensor_list, copy=False) - for tensor_list in tensor_lists - ] + _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list) + flattened_input_tensor = _flatten_for_scatter_gather( + tensor_list, copy=False) def preprocess_fn(stream): - # TODO(Hao): designate a copy stream. - for i, tensor_list in enumerate(tensor_lists): - for j, tensor in enumerate(tensor_list): - nccl_util.copy_tensor(input_flattened[i][j], tensor) + for i, tensor in enumerate(tensor_list): + nccl_util.copy_tensor(flattened_input_tensor[i], tensor) self._collective( - input_flattened, - tensors, + flattened_input_tensor, + tensor, collective_fn, preprocess_fn=preprocess_fn) - def send(self, tensors, send_options=SendOptions()): - """Send a tensor to a destination gpu in the group. + def send(self, tensor, dst_rank): + """Send tensor to a destination process in the group. Args: - tensors (List): the tensor to send. - send_options: send options. + tensor: the tensor to send. + dst_rank: the rank of the destination process. Returns: None @@ -347,15 +321,14 @@ def p2p_fn(tensor, comm, stream, peer): nccl_util.get_tensor_n_elements(tensor), nccl_util.get_nccl_tensor_dtype(tensor), peer, stream.ptr) - self._point2point(tensors, p2p_fn, send_options.dst_rank, - send_options.dst_gpu_index) + self._point2point(tensor, p2p_fn, dst_rank) - def recv(self, tensors, recv_options=RecvOptions()): - """Receive a tensor from a source gpu in the group. + def recv(self, tensor, src_rank): + """Receive tensor from a source process in the group. Args: - tensors (List): the received tensor. - recv_options: Receive options. + tensor: the received tensor. + src_rank: the rank of the source process. 
Returns: None @@ -367,218 +340,128 @@ def p2p_fn(tensor, comm, stream, peer): nccl_util.get_tensor_n_elements(tensor), nccl_util.get_nccl_tensor_dtype(tensor), peer, stream.ptr) - self._point2point(tensors, p2p_fn, recv_options.src_rank, - recv_options.src_gpu_index) - - def _get_nccl_collective_communicator(self, comm_key, device_list): - """Create or retrieve an NCCL communicator from cache. - - If the communicator is found in cache, return the communicator. If not, - a communicator and a stream will be created and put in cache. - TODO(Hao): this function is not thread-safe now. + self._point2point(tensor, p2p_fn, src_rank) - Args: - comm_key (str): the key to query the communicator cache. - device_list (List): a list of GPU devices of the current process - that participates into the collective. + def _get_nccl_collective_communicator(self): + """Create or retrieve a cached NCCL communicator. Returns: - communicator: the NCCL communicator corresponded to the devices. + communicator """ - if not comm_key: - raise RuntimeError("Got empty communicator key.") - for d in device_list: - self._used_gpu_indices.add(d) - - # TODO(Hao): lock the _dev_comm_map here. 
- if comm_key in self._dev_comm_map: - return self._dev_comm_map[comm_key] - - group_key = self._generate_group_key(comm_key) - if self.rank == 0: - nccl_uid = self._generate_nccl_uid(group_key) - else: - rendezvous = Rendezvous(group_key) - rendezvous.meet() - nccl_uid = rendezvous.get_nccl_id() - - # Now create the communicators - actual_world_size = len(device_list) * self.world_size - comms = [None] * len(device_list) - streams = [None] * len(device_list) - nccl_util.groupStart() - for i, device in enumerate(device_list): - actual_rank = self.rank * len(device_list) + i - with nccl_util.Device(device): - comms[i] = nccl_util.create_nccl_communicator( - actual_world_size, nccl_uid, actual_rank) - streams[i] = cupy.cuda.Stream.null - # Stream(non_blocking=True) - nccl_util.groupEnd() - self._dev_comm_map[comm_key] = comms - self._dev_streams_map[comm_key] = streams - return comms - - @staticmethod - def _sync_streams(): - """Let NCCL streams wait for current streams for every device.""" - # FIXME: This behavior is different from nccl document. It seems like - # cupy allocate tensors on null streams. - cupy.cuda.Stream.null.synchronize() - - def _get_nccl_p2p_communicator(self, comm_key, my_gpu_idx, peer_rank, - peer_gpu_idx): + if not self._collective_comm_cache: + # create the communicator + if self.rank == 0: + group_uid = self._generate_nccl_uid(self.group_name) + else: + rendezvous = Rendezvous(self.group_name) + rendezvous.meet() + group_uid = rendezvous.get_nccl_id() + self._collective_comm_cache = \ + nccl_util.create_nccl_communicator(self.world_size, + group_uid, + self.rank) + return self._collective_comm_cache + + def _get_nccl_p2p_communicator(self, rank1, rank2): """Create or retrieve an NCCL communicator for p2p tasks. - Note(Hao): this function is not thread-safe now. - Args: - comm_key (str): communicator key. - my_gpu_idx (int): the gpu index on the current process. - peer_rank (int): the rank of the destination process. 
- peer_gpu_idx (int): the gpu index on the peer process. + rank1 (int): source rank. + rank2 (int): destination rank. + Returns: communicator """ - if not comm_key: - raise RuntimeError("Got empty communicator key.") - - # TODO(Hao): lock the _dev_comm_map here. - if comm_key in self._dev_comm_map: - return self._dev_comm_map[comm_key] - - # Note (Hao): This is a bit complex so I decide to take a note here. - # Here we need to consider three cases: - # Case 1: src_rank != dst_rank, hence the send and recv happen on - # different process (actors/tasks); each process makes independent - # collective calls and manages corresponding communicators. - # Case 2: src_rank == dst_rank, src_gpu_idx == dst_gpu_idx; for - # this case, we simply throw a RuntimeError; - # Case 3: src_rank == dst_rank, src_gpu_idx != dst_gpu_idx, which - # means the send and recv will be called on the same process. We - # DO NOT support this case for now. We need to properly scope: - # (1) communicators creation, and - # (2) send/recv calls - # using groupStart(( and groupEnd() calls to avoid deadlocks. - if self.rank < peer_rank: - my_p2p_rank = 0 - elif self.rank > peer_rank: - my_p2p_rank = 1 - else: - raise RuntimeError( - "Send and recv happens on the same process! " - "ray.util.collective does not support this case as of now. 
" - "Alternatively, consider doing GPU to GPU memcpy?") - - group_key = self._generate_group_key(comm_key) - if my_p2p_rank == 0: - nccl_uid = self._generate_nccl_uid(group_key) - else: - rendezvous = Rendezvous(group_key) - rendezvous.meet() - nccl_uid = rendezvous.get_nccl_id() - - # create the p2p communicators - with nccl_util.Device(my_gpu_idx): - comm = nccl_util.create_nccl_communicator(2, nccl_uid, my_p2p_rank) - stream = cupy.cuda.Stream.null - # Stream(non_blocking=True) - self._dev_comm_map[comm_key] = [comm] - self._dev_streams_map[comm_key] = [stream] - return [comm] - - def _generate_group_key(self, comm_key): - """Generate a unique key used to initialize the KV store. - - The group key is a concatenation of the communicator key and - the group name, following: [comm_key]@[group_name]. - """ - return comm_key + "@" + self.group_name + min_rank = min(rank1, rank2) + max_rank = max(rank1, rank2) + my_rank = 0 if self.rank == min_rank else 1 + p2p_group_key = self._generate_p2p_group_key(min_rank, max_rank) + comm = self._p2p_comm_cache.get(p2p_group_key) + if not comm: + if self.rank == min_rank: + group_uid = self._generate_nccl_uid(p2p_group_key) + else: + rendezvous = Rendezvous(p2p_group_key) + rendezvous.meet() + group_uid = rendezvous.get_nccl_id() + comm = nccl_util.create_nccl_communicator(2, group_uid, my_rank) + self._p2p_comm_cache[p2p_group_key] = comm + return comm + + def _generate_p2p_group_key(self, min_rank, max_rank): + return self.group_name + "_" + str(min_rank) + "_" + str(max_rank) @staticmethod - def _destroy_store(group_key): - """Destroy the KV store (Ray named actor). - - Args: - group_key (str): the unique key to retrieve the KV store. 
+ def _parse_p2p_group_key(key): + strs = key.split("_") + return int(strs[-2]), int(strs[-1]) - Returns: - None - """ - store_name = get_nccl_store_name(group_key) + @staticmethod + def _destroy_store(group_name): + store_name = get_nccl_store_name(group_name) store = ray.get_actor(store_name) # ray.get([store.__ray_terminate__.remote()]) ray.kill(store) - def _generate_nccl_uid(self, key): - """Generate an NCCL unique ID for initializing communicators. - - The method will also create a KV store using Ray named actor and store - the NCCLUniqueID in the store. The store needs to be garbage collected - when destroying the collective group. + def _generate_nccl_uid(self, name): + """Generate an NCCL UID by calling the NCCL API. Args: - key (str): the key of the . + name: the name of the collective group. Returns: - NCCLUniqueID (str): NCCL unique ID. + str: NCCL uid. """ group_uid = nccl_util.get_nccl_unique_id() - store_name = get_nccl_store_name(key) + store_name = get_nccl_store_name(name) # Avoid a potential circular dependency in ray/actor.py from ray.util.collective.util import NCCLUniqueIDStore store = NCCLUniqueIDStore.options( name=store_name, lifetime="detached").remote(store_name) - ray.get([store.set_id.remote(group_uid)]) + ray.wait([store.set_id.remote(group_uid)]) return group_uid + @staticmethod + def _get_cuda_stream(): + """Obtain an idle stream from a stream pool for the collective task.""" + # TODO: implement a simple stream manager. + return cupy.cuda.Stream.null + def _collective(self, - input_tensors, - output_tensors, + input_tensor, + output_tensor, collective_fn, preprocess_fn=None, postprocess_fn=None): """A method to encapsulate all collective calls. Args: - input_tensors: the list of the input tensors. - output_tensors: the list of the output tensors. + input_tensor: the input tensor. + output_tensor: the output tensor. collective_fn: the collective function call. - preprocess_fn: preprocess procedures before collective calls. 
- postprocess_fn: postprocess procedures after collective calls. + preprocess_fn: preprocess function to call before collectives. + postprocess_fn: postprocess function to call after collectives. Returns: None """ - _check_gpu_tensors(input_tensors) - _check_gpu_tensors(output_tensors) - - devices = nccl_util.get_tensor_device_list(input_tensors) - key = _get_comm_key_from_devices(devices) - comms = self._get_nccl_collective_communicator(key, devices) - streams = self._dev_streams_map[key] - - # TODO(Hao): sync streams and events - self._sync_streams() + comm = self._get_nccl_collective_communicator() + stream = self._get_cuda_stream() # Make the collective call if preprocess_fn: - preprocess_fn(streams) - nccl_util.groupStart() - for i, tensor in enumerate(input_tensors): - collective_fn(tensor, output_tensors[i], comms[i], streams[i]) - nccl_util.groupEnd() + preprocess_fn(stream) + collective_fn(input_tensor, output_tensor, comm, stream) if postprocess_fn: - postprocess_fn(streams) + postprocess_fn(stream) - def _point2point(self, tensors, p2p_fn, peer_rank: int, peer_gpu_idx: int): - """A method to encapsulate all peer-to-peer calls (i.e., send/recv). + def _point2point(self, tensor, p2p_fn, peer_rank: int): + """A method to encapsulate all p2p calls. Args: - tensors: the tensor to send or receive. + tensor: the tensor to be sent/received. p2p_fn: the p2p function call. - peer_rank (int): the rank of the peer process. - peer_gpu_idx (int): the index of the gpu on the peer process. + peer_rank (int): the peer rank of the current process. Returns: None @@ -588,24 +471,13 @@ def _point2point(self, tensors, p2p_fn, peer_rank: int, peer_gpu_idx: int): raise RuntimeError("P2p send/recv requires NCCL >= 2.7.4. " "Got '{}'.".format( nccl_util.get_nccl_runtime_version())) - _check_gpu_tensors(tensors) - - # we currently only support single device to single device send/recv. 
- assert len(tensors) == 1 - my_gpu_idx = nccl_util.get_tensor_device(tensors[0]) - comm_key = _get_comm_key_send_recv(self.rank, my_gpu_idx, peer_rank, - peer_gpu_idx) - comms = self._get_nccl_p2p_communicator(comm_key, my_gpu_idx, - peer_rank, peer_gpu_idx) - streams = self._dev_streams_map[comm_key] - - # TODO(Hao): sync streams and events - self._sync_streams() # We have made sure that self.rank != peer_rank during API check. peer_p2p_rank = 0 if self.rank > peer_rank else 1 - for i, tensor in enumerate(tensors): - p2p_fn(tensors[i], comms[i], streams[i], peer_p2p_rank) + comm = self._get_nccl_p2p_communicator(self.rank, peer_rank) + stream = self._get_cuda_stream() + # Make the p2p call: + p2p_fn(tensor, comm, stream, peer_p2p_rank) def _flatten_for_scatter_gather(tensor_list, copy=False): @@ -624,130 +496,29 @@ def _flatten_for_scatter_gather(tensor_list, copy=False): # note we need a cupy dtype here. dtype = nccl_util.get_cupy_tensor_dtype(t) buffer_shape = [len(tensor_list)] + nccl_util.get_tensor_shape(t) - device = nccl_util.get_tensor_device(t) - with nccl_util.Device(device): - buffer = cupy.empty(buffer_shape, dtype=dtype) + buffer = cupy.empty(buffer_shape, dtype=dtype) if copy: for i, tensor in enumerate(tensor_list): nccl_util.copy_tensor(buffer[i], tensor) return buffer -def _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists): - """Check the compatibility between tensor input and tensor list input.""" - if not tensors or not isinstance(tensors, list): - raise RuntimeError( - "The first argument 'tensors' expects a list of tensors.") - if not tensor_lists or not isinstance(tensor_lists, list): - raise RuntimeError("The second argument 'tensor_lists' " - "expects a list of tensor list.") - dtype = nccl_util.get_nccl_tensor_dtype(tensors[0]) - shape = nccl_util.get_tensor_shape(tensors[0]) - for i, tensor_list in enumerate(tensor_lists): - # check all tensor in `tensors` match. 
- dt = nccl_util.get_nccl_tensor_dtype(tensors[i]) +def _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list): + """Check the compatibility between tensor input and tensor list inputs.""" + if not tensor_list: + raise RuntimeError("Got empty list of tensors.") + dtype = nccl_util.get_nccl_tensor_dtype(tensor) + shape = nccl_util.get_tensor_shape(tensor) + for t in tensor_list: + # check dtype + dt = nccl_util.get_nccl_tensor_dtype(t) if dt != dtype: raise RuntimeError("All tensor operands to scatter/gather must " - "have the same dtype. Got '{}' and '{}'." - .format(dt, dtype)) + "have the same dtype. Got '{}' and '{}'" + "".format(dt, dtype)) # Note: typically CCL libraries only requires they have the same - # number of elements; Here we make it more strict -- we require - # exact shape match. - s = nccl_util.get_tensor_shape(tensors[i]) - if s != shape: + # number of elements; + # Here we make it more strict -- we require exact shape match. + if nccl_util.get_tensor_shape(t) != shape: raise RuntimeError("All tensor operands to scatter/gather must " - "have the same shape. Got '{}' and '{}'." - .format(s, shape)) - # check all tensors in `tensor_lists` match. - for t in tensor_lists[i]: - # check dtype - dt = nccl_util.get_nccl_tensor_dtype(t) - if dt != dtype: - raise RuntimeError( - "All tensor operands to scatter/gather must " - "have the same dtype. Got '{}' and '{}'.".format( - dt, dtype)) - s = nccl_util.get_tensor_shape(t) - if s != shape: - raise RuntimeError( - "All tensor operands to scatter/gather must " - "have the same shape. Got '{}' and '{}'.".format(s, shape)) - - -def _check_gpu_tensors(tensors): - """Check all tensors are distributed on different GPUs.""" - if not tensors or not isinstance(tensors, list): - raise RuntimeError("'tensors' must be a nonempty list.") - if len(tensors) > nccl_util.get_num_gpus(): - raise RuntimeError("Tensor list cannot be larger than the number" - "of available GPUs. 
Got {} > {}.".format( - len(tensors), nccl_util.get_num_gpus())) - t0 = tensors[0] - dt = nccl_util.get_nccl_tensor_dtype(t0) - s = nccl_util.get_tensor_shape(t0) - d = nccl_util.get_tensor_device(t0) - for i, t in enumerate(tensors): - if i == 0: - continue - # We need to check the following: - # (1) tensor is cuda (already checked during API) - # (2) tensor dtype - # (3) tensor shape match - # (4) each tensor is on a different GPU - dtype = nccl_util.get_nccl_tensor_dtype(t) - if dt != dtype: - raise RuntimeError("Tensors must have identical dtype. Got: '{}'." - .format(dtype)) - shape = nccl_util.get_tensor_shape(t) - if s != shape: - raise RuntimeError("Tensor must have identical shape. Got: '{}'." - .format(shape)) - device = nccl_util.get_tensor_device(t) - if device == d: - raise RuntimeError("Tensor must be on distinct GPUs.") - - -def _get_comm_key_from_devices(devices): - """Return a key from a list of devices for collective calls. - - For example, if the tensors are on gpus 0, 1, 2, 3, - then the key would be "0,1,2,3". - - Args: - devices(list): a list of GPU device indices - - Returns: - str: a string represents the key to query the communicator cache. - - """ - return ",".join([str(d) for d in devices]) - - -def _get_comm_key_send_recv(my_rank, my_gpu_idx, peer_rank, peer_gpu_idx): - """Return a key given source and destination ranks for p2p tasks. - - The p2p key is in the following form: - [min_rank]_[gpu_index]:[max_rank]_[gpu_index]. - - Args: - my_rank (int): the rank of the source process. - my_gpu_idx (int): the source gpu index on the process. - peer_rank (int): the rank of the destination process. - peer_gpu_idx (int): the destination gpu index on the process. - - Returns: - comm_key (str): a string key to query the communication cache. 
- """ - if my_rank < peer_rank: - lower_key = str(my_rank) + "_" + str(my_gpu_idx) - higher_key = str(peer_rank) + "_" + str(peer_gpu_idx) - elif my_rank > peer_rank: - lower_key = str(peer_rank) + "_" + str(peer_gpu_idx) - higher_key = str(my_rank) + "_" + str(my_gpu_idx) - else: - raise RuntimeError( - "Send and recv happens on the same process. ray.util.collective " - "does not support this case as of now. Alternatively, consider " - "doing GPU to GPU memcpy?") - comm_key = lower_key + ":" + higher_key - return comm_key + "have the same shape.") diff --git a/python/ray/util/collective/collective_group/nccl_util.py b/python/ray/util/collective/collective_group/nccl_util.py index 36895d79b884..889c8c443f36 100644 --- a/python/ray/util/collective/collective_group/nccl_util.py +++ b/python/ray/util/collective/collective_group/nccl_util.py @@ -3,12 +3,9 @@ try: import cupy from cupy.cuda import nccl - from cupy.cuda import Device # noqa: F401 from cupy.cuda.nccl import get_version from cupy.cuda.nccl import get_build_version from cupy.cuda.nccl import NcclCommunicator - from cupy.cuda.nccl import groupStart # noqa: F401 - from cupy.cuda.nccl import groupEnd # noqa: F401 except ImportError: raise ImportError("NCCL in Ray requires Cupy being available!") @@ -77,11 +74,6 @@ } -def get_num_gpus(): - """Returns the number of compute-capable GPUs.""" - return cupy.cuda.runtime.getDeviceCount() - - def get_nccl_build_version(): return get_build_version() @@ -98,12 +90,14 @@ def create_nccl_communicator(world_size, nccl_unique_id, rank): """Create an NCCL communicator using NCCL APIs. Args: - world_size (int): the number of processes of this communicator group. + world_size (int): the number of processes of this communcator group. nccl_unique_id (str): the NCCLUniqueID for this group. rank (int): the rank of this process. Returns: comm (nccl.ncclComm_t): an NCCL communicator. """ + # TODO(Hao): make this inside the NCCLComm class, + # and implement the abort method. 
Make it RAII. comm = NcclCommunicator(world_size, nccl_unique_id, rank) return comm @@ -155,7 +149,7 @@ def get_tensor_ptr(tensor): if torch_available(): if isinstance(tensor, torch.Tensor): if not tensor.is_cuda: - raise RuntimeError("Torch tensor must be on GPU.") + raise RuntimeError("torch tensor must be on gpu.") return tensor.data_ptr() raise ValueError("Unsupported tensor type. Got: {}. Supported " "GPU tensor types are: torch.Tensor, " @@ -200,24 +194,6 @@ def get_tensor_strides(tensor): "cupy.ndarray.".format(type(tensor))) -def get_tensor_device(tensor): - """Return the GPU index of a tensor.""" - if isinstance(tensor, cupy.ndarray): - try: - device = tensor.device.id - except AttributeError as exec: - raise RuntimeError("The tensor is not on a valid GPU.") \ - from exec - elif torch_available() and isinstance(tensor, torch.Tensor): - device = tensor.device.index - if not isinstance(device, int): - raise RuntimeError("The tensor is not on a valid GPU.") - else: - raise ValueError("Unsupported tensor type. " - "Got: {}.".format(type(tensor))) - return device - - def copy_tensor(dst_tensor, src_tensor): """Copy the content from src_tensor to dst_tensor. @@ -252,21 +228,3 @@ def copy_tensor(dst_tensor, src_tensor): raise ValueError("Unsupported tensor type. Got: {} and {}. Supported " "GPU tensor types are: torch.Tensor, cupy.ndarray." .format(type(dst_tensor), type(src_tensor))) - - -def get_tensor_device_list(tensors): - """Returns the gpu devices of the list of input tensors. - - Args: - tensors(list): a list of tensors, each locates on a GPU. - - Returns: - list: the list of GPU devices. - - """ - if not isinstance(tensors, list): - raise RuntimeError( - "Expect a list of tensors each locates on a GPU device. 
" - "Got: '{}'.".format(type(tensors))) - devices = [get_tensor_device(t) for t in tensors] - return devices diff --git a/python/ray/util/collective/examples/nccl_allreduce_example.py b/python/ray/util/collective/examples/nccl_allreduce_example.py index 797924621a52..7010d69249f2 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example.py @@ -11,11 +11,12 @@ def __init__(self): self.recv = cp.zeros((4, ), dtype=cp.float32) def setup(self, world_size, rank): - collective.init_collective_group(world_size, rank, "nccl", "default") + collective.init_collective_group("nccl", world_size, rank, "default") return True def compute(self): collective.allreduce(self.send, "default") + print(self.send) return self.send def destroy(self): @@ -23,8 +24,11 @@ def destroy(self): if __name__ == "__main__": + send = cp.ones((4, ), dtype=cp.float32) + ray.init(num_gpus=2) + num_workers = 2 workers = [] init_rets = [] @@ -34,4 +38,5 @@ def destroy(self): init_rets.append(w.setup.remote(num_workers, i)) _ = ray.get(init_rets) results = ray.get([w.compute.remote() for w in workers]) + # print(results) ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py index 106ea31b2b7f..9d0335dbab11 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py @@ -30,4 +30,5 @@ def compute(self): } collective.declare_collective_group(workers, **_options) results = ray.get([w.compute.remote() for w in workers]) + print(results) ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py b/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py deleted file mode 100644 index 
88b75802e880..000000000000 --- a/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py +++ /dev/null @@ -1,43 +0,0 @@ -import ray -import cupy as cp - -import ray.util.collective as collective -from cupy.cuda import Device - - -@ray.remote(num_gpus=2) -class Worker: - def __init__(self): - with Device(0): - self.send1 = cp.ones((4, ), dtype=cp.float32) - with Device(1): - self.send2 = cp.ones((4, ), dtype=cp.float32) * 2 - - self.recv = cp.zeros((4, ), dtype=cp.float32) - - def setup(self, world_size, rank): - collective.init_collective_group(world_size, rank, "nccl", "177") - return True - - def compute(self): - collective.allreduce_multigpu([self.send1, self.send2], "177") - return [self.send1, self.send2], self.send1.device, self.send2.device - - def destroy(self): - collective.destroy_collective_group("177") - - -if __name__ == "__main__": - ray.init(address="auto") - num_workers = 2 - workers = [] - init_rets = [] - for i in range(num_workers): - w = Worker.remote() - workers.append(w) - init_rets.append(w.setup.remote(num_workers, i)) - a = ray.get(init_rets) - results = ray.get([w.compute.remote() for w in workers]) - print(results) - ray.get([w.destroy.remote() for w in workers]) - ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py b/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py deleted file mode 100644 index 7ff637a5bd68..000000000000 --- a/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py +++ /dev/null @@ -1,53 +0,0 @@ -import ray -import cupy as cp - -import ray.util.collective as collective -from cupy.cuda import Device - - -@ray.remote(num_gpus=2) -class Worker: - def __init__(self): - with Device(0): - self.send1 = cp.ones((4, ), dtype=cp.float32) - with Device(1): - self.send2 = cp.ones((4, ), dtype=cp.float32) * 2 - - with Device(0): - self.recv1 = cp.zeros((4, ), dtype=cp.float32) - with Device(1): - self.recv2 = cp.zeros((4, ), dtype=cp.float32) - 
self.rank = -1 - - def setup(self, world_size, rank): - self.rank = rank - collective.init_collective_group(world_size, rank, "nccl", "8") - return True - - def compute(self): - if self.rank == 0: - with Device(0): - collective.send_multigpu(self.send1 * 2, 1, 1, "8") - else: - # with Device(1): - collective.recv_multigpu(self.recv2, 0, 0, "8") - return self.recv2 - - def destroy(self): - collective.destroy_collective_group("8") - - -if __name__ == "__main__": - ray.init(address="auto") - num_workers = 2 - workers = [] - init_rets = [] - for i in range(num_workers): - w = Worker.remote() - workers.append(w) - init_rets.append(w.setup.remote(num_workers, i)) - a = ray.get(init_rets) - results = ray.get([w.compute.remote() for w in workers]) - print(results) - ray.get([w.destroy.remote() for w in workers]) - ray.shutdown() diff --git a/python/ray/util/collective/tests/conftest.py b/python/ray/util/collective/tests/conftest.py index 341142ec050d..ab5b3765d166 100644 --- a/python/ray/util/collective/tests/conftest.py +++ b/python/ray/util/collective/tests/conftest.py @@ -1,41 +1,30 @@ """Some fixtures for collective tests.""" -import logging - import pytest + import ray -from ray.util.collective.collective_group.nccl_collective_group \ - import _get_comm_key_from_devices, _get_comm_key_send_recv from ray.util.collective.const import get_nccl_store_name -logger = logging.getLogger(__name__) -logger.setLevel("INFO") - # TODO (Hao): remove this clean_up function as it sometimes crashes Ray. 
def clean_up(): group_names = ["default", "test", "123?34!", "default2", "random"] group_names.extend([str(i) for i in range(10)]) max_world_size = 4 - all_keys = [] + p2p_group_names = [] for name in group_names: - devices = [[0], [0, 1], [1, 0]] - for d in devices: - collective_communicator_key = _get_comm_key_from_devices(d) - all_keys.append(collective_communicator_key + "@" + name) for i in range(max_world_size): for j in range(max_world_size): - if i < j: - p2p_communicator_key = _get_comm_key_send_recv(i, 0, j, 0) - all_keys.append(p2p_communicator_key + "@" + name) - for group_key in all_keys: - store_name = get_nccl_store_name(group_key) + if i <= j: + p2p_group_name = name + "_" + str(i) + "_" + str(j) + p2p_group_names.append(p2p_group_name) + all_names = group_names + p2p_group_names + for group_name in all_names: + store_name = get_nccl_store_name(group_name) try: actor = ray.get_actor(store_name) except ValueError: actor = None if actor: - logger.debug("Killing actor with group_key: '{}' and store: '{}'." - .format(group_key, store_name)) ray.kill(actor) @@ -52,18 +41,6 @@ def ray_start_single_node_2_gpus(): # my own on-premise cluster before run this fixture. @pytest.fixture def ray_start_distributed_2_nodes_4_gpus(): - # The cluster has a setup of 2 nodes, each node with 2 - # GPUs. Each actor will be allocated 1 GPU. - ray.init("auto") - yield - clean_up() - ray.shutdown() - - -@pytest.fixture -def ray_start_distributed_multigpu_2_nodes_4_gpus(): - # The cluster has a setup of 2 nodes, each node with 2 - # GPUs. Each actor will be allocated 2 GPUs. 
ray.init("auto") yield clean_up() diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py b/python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py deleted file mode 100644 index c4cabcd45524..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Test the allgather API on a distributed Ray cluster.""" -import pytest -import ray - -import cupy as cp -import torch - -from ray.util.collective.tests.util import \ - create_collective_multigpu_workers, \ - init_tensors_for_gather_scatter_multigpu - - -@pytest.mark.parametrize("tensor_backend", ["cupy", "torch"]) -@pytest.mark.parametrize("array_size", - [2, 2**5, 2**10, 2**15, 2**20, [2, 2], [5, 5, 5]]) -def test_allgather_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, - tensor_backend): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - init_tensors_for_gather_scatter_multigpu( - actors, array_size=array_size, tensor_backend=tensor_backend) - results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - for k in range(actual_world_size): - if tensor_backend == "cupy": - assert (results[i][j][k] == cp.ones( - array_size, dtype=cp.float32)).all() - else: - assert (results[i][j][k] == torch.ones( - array_size, dtype=torch.float32).cuda(j)).all() - - -def test_allgather_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = 
world_size * num_gpu_per_worker - shape = [10, 10] - actors, _ = create_collective_multigpu_workers(world_size) - - # tensor is pytorch, list is cupy - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - for k in range(actual_world_size): - assert (results[i][j][k] == cp.ones(shape, - dtype=cp.float32)).all() - - # tensor is cupy, list is pytorch - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - for k in range(actual_world_size): - assert (results[i][j][k] == torch.ones( - shape, dtype=torch.float32).cuda(j)).all() - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py deleted file mode 100644 index b681a08490b0..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Test the collective allreduice API on a distributed Ray cluster.""" -import pytest -import logging - -import cupy as cp - -import ray -from ray.util.collective.types import ReduceOp -from ray.util.collective.tests.util import create_collective_multigpu_workers - -logger = logging.getLogger(__name__) -logger.setLevel("DEBUG") - - 
-@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -def test_allreduce_multigpu_different_name( - ray_start_distributed_multigpu_2_nodes_4_gpus, group_name): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers( - num_workers=world_size, group_name=group_name) - results = ray.get( - [a.do_allreduce_multigpu.remote(group_name) for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - assert (results[1] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - - -@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) -def test_allreduce_multigpu_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - ray.get([a.set_buffer.remote(array_size) for a in actors]) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones( - (array_size, ), dtype=cp.float32) * actual_world_size).all() - assert (results[1] == cp.ones( - (array_size, ), dtype=cp.float32) * actual_world_size).all() - - -def test_allreduce_multigpu_destroy( - ray_start_distributed_multigpu_2_nodes_4_gpus, - backend="nccl", - group_name="default"): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - assert (results[1] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - - # destroy the group and try do work, should fail - ray.get([a.destroy_group.remote() for a in actors]) - with pytest.raises(RuntimeError): - results 
= ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - - # reinit the same group and all reduce - ray.get([ - actor.init_group.remote(world_size, i, backend, group_name) - for i, actor in enumerate(actors) - ]) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * actual_world_size - * actual_world_size).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * actual_world_size - * actual_world_size).all() - - -def test_allreduce_multigpu_multiple_group( - ray_start_distributed_multigpu_2_nodes_4_gpus, - backend="nccl", - num_groups=5): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - for group_name in range(1, num_groups): - ray.get([ - actor.init_group.remote(world_size, i, backend, str(group_name)) - for i, actor in enumerate(actors) - ]) - for i in range(num_groups): - group_name = "default" if i == 0 else str(i) - results = ray.get( - [a.do_allreduce_multigpu.remote(group_name) for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=cp.float32) * (actual_world_size**(i + 1))).all() - - -def test_allreduce_multigpu_different_op( - ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - # check product - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get( - [a.do_allreduce_multigpu.remote(op=ReduceOp.PRODUCT) for a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 120).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 120).all() - - # check min - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get( - [a.do_allreduce_multigpu.remote(op=ReduceOp.MIN) for 
a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 2).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 2).all() - - # check max - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get( - [a.do_allreduce_multigpu.remote(op=ReduceOp.MAX) for a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 5).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 5).all() - - -@pytest.mark.parametrize("dtype", - [cp.uint8, cp.float16, cp.float32, cp.float64]) -def test_allreduce_multigpu_different_dtype( - ray_start_distributed_multigpu_2_nodes_4_gpus, dtype): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - ray.get([a.set_buffer.remote([10], dtype=dtype) for a in actors]) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=dtype) * actual_world_size).all() - assert (results[1] == cp.ones( - (10, ), dtype=dtype) * actual_world_size).all() - - -def test_allreduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus): - # import torch - world_size = 2 - actual_world_size = 4 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([10])) - ray.get(actors[1].set_buffer.remote( - [10], tensor_type0="torch", tensor_type1="torch")) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones((10, )) * actual_world_size).all() - - ray.get(actors[0].set_buffer.remote( - [10], tensor_type0="cupy", tensor_type1="torch")) - ray.get(actors[1].set_buffer.remote( - [10], tensor_type0="torch", tensor_type1="cupy")) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones((10, )) * actual_world_size).all() diff --git 
a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py deleted file mode 100644 index 40be55dd2e0b..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Test the collective group APIs.""" -import pytest -import ray -from random import shuffle - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -def test_init_two_actors(ray_start_distributed_multigpu_2_nodes_4_gpus, - group_name): - world_size = 2 - actors, results = create_collective_multigpu_workers( - world_size, group_name) - for i in range(world_size): - assert (results[i]) - - -def test_report_num_gpus(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, results = create_collective_multigpu_workers(world_size) - num_gpus = ray.get([actor.report_num_gpus.remote() for actor in actors]) - assert num_gpus == [2, 2] - - -def test_get_rank(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - actor0_rank = ray.get(actors[0].report_rank.remote()) - assert actor0_rank == 0 - actor1_rank = ray.get(actors[1].report_rank.remote()) - assert actor1_rank == 1 - - # create a second group with a different name, and different - # orders of ranks. 
- new_group_name = "default2" - ranks = list(range(world_size)) - shuffle(ranks) - _ = ray.get([ - actor.init_group.remote( - world_size, ranks[i], group_name=new_group_name) - for i, actor in enumerate(actors) - ]) - actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name)) - assert actor0_rank == ranks[0] - actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name)) - assert actor1_rank == ranks[1] - - -def test_availability(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - actor0_nccl_availability = ray.get( - actors[0].report_nccl_availability.remote()) - assert actor0_nccl_availability - actor0_gloo_availability = ray.get( - actors[0].report_gloo_availability.remote()) - assert not actor0_gloo_availability - - -def test_is_group_initialized(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - # check group is_init - actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor0_is_init - actor0_is_init = ray.get( - actors[0].report_is_group_initialized.remote("random")) - assert not actor0_is_init - actor0_is_init = ray.get( - actors[0].report_is_group_initialized.remote("123")) - assert not actor0_is_init - actor1_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor1_is_init - actor1_is_init = ray.get( - actors[0].report_is_group_initialized.remote("456")) - assert not actor1_is_init - - -def test_destroy_group(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - # Now destroy the group at actor0 - ray.wait([actors[0].destroy_group.remote()]) - actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert not actor0_is_init - - # should go well as the group `random` does not exist at all - ray.wait([actors[0].destroy_group.remote("random")]) - - 
actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) - assert actor1_is_init - ray.wait([actors[1].destroy_group.remote("random")]) - actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) - assert actor1_is_init - ray.wait([actors[1].destroy_group.remote("default")]) - actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) - assert not actor1_is_init - - # Now reconstruct the group using the same name - init_results = ray.get([ - actor.init_group.remote(world_size, i) - for i, actor in enumerate(actors) - ]) - for i in range(world_size): - assert init_results[i] - actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor0_is_init - actor1_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor1_is_init - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py deleted file mode 100644 index 5ded5bce35e8..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test the broadcast API.""" -import pytest -import cupy as cp -import ray - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -def test_broadcast_different_name( - ray_start_distributed_multigpu_2_nodes_4_gpus, group_name, src_rank, - src_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers( - num_workers=world_size, group_name=group_name) - ray.get(actors[0].set_buffer.remote([10], value0=2, 
value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - - results = ray.get([ - a.do_broadcast_multigpu.remote( - group_name=group_name, - src_rank=src_rank, - src_gpu_index=src_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = (src_rank + 1) * 2 + src_gpu_index - assert ( - results[i][j] == cp.ones([10], dtype=cp.float32) * val).all() - - -@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -def test_broadcast_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, src_rank, - src_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([array_size], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([array_size], value0=4, value1=5)) - results = ray.get([ - a.do_broadcast_multigpu.remote( - src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = (src_rank + 1) * 2 + src_gpu_index - assert (results[i][j] == cp.ones( - (array_size, ), dtype=cp.float32) * val).all() - - -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -def test_broadcast_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus, - src_rank, src_gpu_index): - import torch - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote( - [10], value0=4, value1=5, tensor_type0="torch", tensor_type1="torch")) - results = ray.get([ - a.do_broadcast_multigpu.remote( - src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = 
(src_rank + 1) * 2 + src_gpu_index - if i == 0: - assert (results[i][j] == cp.ones([10], dtype=cp.float32) * - val).all() - else: - assert (results[i][j] == torch.ones([10]).cuda(j) * val).all() - - -@pytest.mark.parametrize("src_rank", [3, 4]) -@pytest.mark.parametrize("src_gpu_index", [2, 3]) -def test_broadcast_invalid_rank(ray_start_distributed_multigpu_2_nodes_4_gpus, - src_rank, src_gpu_index): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - with pytest.raises(ValueError): - _ = ray.get([ - a.do_broadcast_multigpu.remote( - src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors - ]) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py deleted file mode 100644 index 8ac5d54c1c12..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Test the reduce API.""" -import pytest -import cupy as cp -import ray -from ray.util.collective.types import ReduceOp - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_different_name(ray_start_distributed_multigpu_2_nodes_4_gpus, - group_name, dst_rank, dst_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers( - num_workers=world_size, group_name=group_name) - results = ray.get([ - a.do_reduce_multigpu.remote( - group_name, dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) - for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] 
== cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - else: - assert (results[i][j] == cp.ones((10, ), - dtype=cp.float32)).all() - - -@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, dst_rank, - dst_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(num_workers=world_size) - - ray.get(actors[0].set_buffer.remote(array_size)) - ray.get(actors[1].set_buffer.remote(array_size)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (array_size, ), dtype=cp.float32) * - actual_world_size).all() - else: - assert (results[i][j] == cp.ones( - (array_size, ), dtype=cp.float32)).all() - - -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_different_op(ray_start_distributed_multigpu_2_nodes_4_gpus, - dst_rank, dst_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - - # check product - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, - dst_gpu_index=dst_gpu_index, - op=ReduceOp.PRODUCT) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * 120).all() - else: - val = (i + 1) * 2 + j - assert (results[i][j] == 
cp.ones( - (10, ), dtype=cp.float32) * val).all() - - # check min - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MIN) - for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * 2).all() - else: - val = (i + 1) * 2 + j - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * val).all() - - # check max - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MAX) - for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * 5).all() - else: - val = (i + 1) * 2 + j - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * val).all() - - -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus, - dst_rank, dst_gpu_index): - import torch - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote( - [10], value0=4, value1=5, tensor_type0="torch", tensor_type1="torch")) - - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors - ]) - - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = (i + 1) * 2 + j - if dst_rank == i and dst_gpu_index == j: - if i == 0: - 
assert (results[i][j] == cp.ones([10], dtype=cp.float32) * - 14).all() - else: - assert ( - results[i][j] == torch.ones([10]).cuda(j) * 14).all() - else: - if i == 0: - assert (results[i][j] == cp.ones([10], dtype=cp.float32) * - val).all() - else: - assert ( - results[i][j] == torch.ones([10]).cuda(j) * val).all() - - -@pytest.mark.parametrize("dst_rank", [3, 4]) -@pytest.mark.parametrize("dst_gpu_index", [2, 3]) -def test_reduce_invalid_rank(ray_start_distributed_multigpu_2_nodes_4_gpus, - dst_rank, dst_gpu_index): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - with pytest.raises(ValueError): - _ = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors - ]) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py deleted file mode 100644 index 48f72389bf89..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Test the collective reducescatter API on a distributed Ray cluster.""" -import pytest -import ray - -import cupy as cp -import torch - -from ray.util.collective.tests.util import \ - create_collective_multigpu_workers, \ - init_tensors_for_gather_scatter_multigpu - - -@pytest.mark.parametrize("tensor_backend", ["cupy", "torch"]) -@pytest.mark.parametrize("array_size", - [2, 2**5, 2**10, 2**15, 2**20, [2, 2], [5, 5, 5]]) -def test_reducescatter_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, - tensor_backend): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - - init_tensors_for_gather_scatter_multigpu( - actors, array_size=array_size, tensor_backend=tensor_backend) - 
results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if tensor_backend == "cupy": - assert (results[i][j] == cp.ones(array_size, dtype=cp.float32) - * actual_world_size).all() - else: - assert (results[i][j] == torch.ones( - array_size, dtype=torch.float32).cuda(j) * - actual_world_size).all() - - -def test_reducescatter_torch_cupy( - ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - shape = [10, 10] - actors, _ = create_collective_multigpu_workers(world_size) - - # tensor is pytorch, list is cupy - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - assert (results[i][j] == torch.ones( - shape, dtype=torch.float32).cuda(j) * actual_world_size).all() - - # tensor is cupy, list is pytorch - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - assert (results[i][j] == cp.ones(shape, dtype=cp.float32) * - actual_world_size).all() - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py deleted file mode 100644 
index a88fdb34ec8f..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Test the send/recv API.""" -import cupy as cp -import pytest -import ray - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -# @pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -@pytest.mark.parametrize("array_size", - [2**10, 2**15, 2**20, [2, 2], [5, 9, 10, 85]]) -def test_sendrecv(ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, - src_rank, dst_rank, src_gpu_index, dst_gpu_index): - if src_rank == dst_rank: - return - world_size = 2 - actors, _ = create_collective_multigpu_workers(num_workers=world_size) - - ray.get(actors[0].set_buffer.remote(array_size, value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote(array_size, value0=4, value1=5)) - - refs = [] - for i in range(world_size): - refs.append(actors[i].get_buffer.remote()) - refs[src_rank][src_gpu_index] = actors[src_rank].do_send_multigpu.remote( - dst_rank=dst_rank, - dst_gpu_index=dst_gpu_index, - src_gpu_index=src_gpu_index) - refs[dst_rank][dst_gpu_index] = actors[dst_rank].do_recv_multigpu.remote( - src_rank=src_rank, - src_gpu_index=src_gpu_index, - dst_gpu_index=dst_gpu_index) - results = [] - results_flattend = ray.get(refs[0] + refs[1]) - results.append([results_flattend[0], results_flattend[1]]) - results.append([results_flattend[2], results_flattend[3]]) - assert (results[src_rank][src_gpu_index] == cp.ones( - array_size, dtype=cp.float32) * ( - (src_rank + 1) * 2 + src_gpu_index)).all() - assert (results[dst_rank][dst_gpu_index] == cp.ones( - array_size, dtype=cp.float32) * ( - (src_rank + 1) * 2 + src_gpu_index)).all() - ray.get([a.destroy_group.remote() for a in 
actors]) diff --git a/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py b/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py index a0dd4508001f..0f17b79ba63e 100644 --- a/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py +++ b/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py @@ -69,9 +69,9 @@ def test_availability(ray_start_distributed_2_nodes_4_gpus): actor0_nccl_availability = ray.get( actors[0].report_nccl_availability.remote()) assert actor0_nccl_availability - actor0_gloo_availability = ray.get( - actors[0].report_gloo_availability.remote()) - assert not actor0_gloo_availability + actor0_mpi_availability = ray.get( + actors[0].report_mpi_availability.remote()) + assert not actor0_mpi_availability def test_is_group_initialized(ray_start_distributed_2_nodes_4_gpus): diff --git a/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py b/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py index 5c1ecd7f14d8..408ebce76b8a 100644 --- a/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py +++ b/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py @@ -60,8 +60,7 @@ def test_broadcast_torch_cupy(ray_start_distributed_2_nodes_4_gpus, src_rank): assert (results[1] == torch.ones((10, )).cuda() * world_size).all() -def test_broadcast_invalid_rank(ray_start_distributed_2_nodes_4_gpus, - src_rank=3): +def test_broadcast_invalid_rank(ray_start_single_node_2_gpus, src_rank=3): world_size = 2 actors, _ = create_collective_workers(world_size) with pytest.raises(ValueError): diff --git a/python/ray/util/collective/tests/sinlge_node_tests/__init__.py b/python/ray/util/collective/tests/sinlge_node_tests/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git 
a/python/ray/util/collective/tests/sinlge_node_tests/test_allgather.py b/python/ray/util/collective/tests/test_allgather.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_allgather.py rename to python/ray/util/collective/tests/test_allgather.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_allreduce.py b/python/ray/util/collective/tests/test_allreduce.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_allreduce.py rename to python/ray/util/collective/tests/test_allreduce.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py b/python/ray/util/collective/tests/test_basic_apis.py similarity index 97% rename from python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py rename to python/ray/util/collective/tests/test_basic_apis.py index 29a3ec3f4a15..8c23442a3b4c 100644 --- a/python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py +++ b/python/ray/util/collective/tests/test_basic_apis.py @@ -64,9 +64,9 @@ def test_availability(ray_start_single_node_2_gpus): actor0_nccl_availability = ray.get( actors[0].report_nccl_availability.remote()) assert actor0_nccl_availability - actor0_gloo_availability = ray.get( - actors[0].report_gloo_availability.remote()) - assert not actor0_gloo_availability + actor0_mpi_availability = ray.get( + actors[0].report_mpi_availability.remote()) + assert not actor0_mpi_availability def test_is_group_initialized(ray_start_single_node_2_gpus): diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_broadcast.py b/python/ray/util/collective/tests/test_broadcast.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_broadcast.py rename to python/ray/util/collective/tests/test_broadcast.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_reduce.py b/python/ray/util/collective/tests/test_reduce.py similarity index 
100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_reduce.py rename to python/ray/util/collective/tests/test_reduce.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_reducescatter.py b/python/ray/util/collective/tests/test_reducescatter.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_reducescatter.py rename to python/ray/util/collective/tests/test_reducescatter.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_sendrecv.py b/python/ray/util/collective/tests/test_sendrecv.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_sendrecv.py rename to python/ray/util/collective/tests/test_sendrecv.py diff --git a/python/ray/util/collective/tests/util.py b/python/ray/util/collective/tests/util.py index a5fb97a53ad5..259ee24c9727 100644 --- a/python/ray/util/collective/tests/util.py +++ b/python/ray/util/collective/tests/util.py @@ -1,29 +1,20 @@ import cupy as cp -import logging import ray import ray.util.collective as col from ray.util.collective.types import Backend, ReduceOp -from ray.util.collective.collective_group.nccl_util import get_num_gpus import torch -logger = logging.getLogger(__name__) - @ray.remote(num_gpus=1) class Worker: def __init__(self): - self.buffer = None - self.list_buffer = None - - def init_tensors(self): self.buffer = cp.ones((10, ), dtype=cp.float32) self.list_buffer = [ - cp.ones((10, ), dtype=cp.float32) for _ in range(2) + cp.ones((10, ), dtype=cp.float32), + cp.ones((10, ), dtype=cp.float32) ] - cp.cuda.Stream.null.synchronize() - return True def init_group(self, world_size, @@ -88,8 +79,8 @@ def report_nccl_availability(self): avail = col.nccl_available() return avail - def report_gloo_availability(self): - avail = col.gloo_available() + def report_mpi_availability(self): + avail = col.mpi_available() return avail def report_is_group_initialized(self, group_name="default"): @@ -100,11 +91,7 
@@ def report_is_group_initialized(self, group_name="default"): def create_collective_workers(num_workers=2, group_name="default", backend="nccl"): - actors = [None] * num_workers - for i in range(num_workers): - actor = Worker.remote() - ray.get([actor.init_tensors.remote()]) - actors[i] = actor + actors = [Worker.remote() for _ in range(num_workers)] world_size = num_workers init_results = ray.get([ actor.init_group.remote(world_size, i, backend, group_name) @@ -125,7 +112,7 @@ def init_tensors_for_gather_scatter(actors, t = torch.ones(array_size, dtype=torch.float32).cuda() * (i + 1) else: raise RuntimeError("Unsupported tensor backend.") - ray.get([a.set_buffer.remote(t)]) + ray.wait([a.set_buffer.remote(t)]) if tensor_backend == "cupy": list_buffer = [ cp.ones(array_size, dtype=dtype) for _ in range(world_size) @@ -138,250 +125,3 @@ def init_tensors_for_gather_scatter(actors, else: raise RuntimeError("Unsupported tensor backend.") ray.get([a.set_list_buffer.remote(list_buffer) for a in actors]) - - -@ray.remote(num_gpus=2) -class MultiGPUWorker: - def __init__(self): - self.buffer0 = None - self.buffer1 = None - self.list_buffer0 = None - self.list_buffer1 = None - - def __del__(self): - self.buffer0 = None - self.buffer1 = None - self.list_buffer0 = None - self.list_buffer1 = None - - def init_tensors(self): - with cp.cuda.Device(0): - self.buffer0 = cp.ones((10, ), dtype=cp.float32) - self.list_buffer0 = [ - cp.ones((10, ), dtype=cp.float32) for _ in range(4) - ] - with cp.cuda.Device(1): - self.buffer1 = cp.ones((10, ), dtype=cp.float32) - self.list_buffer1 = [ - cp.ones((10, ), dtype=cp.float32) for _ in range(4) - ] - cp.cuda.Stream.null.synchronize() - return True - - def init_group(self, - world_size, - rank, - backend=Backend.NCCL, - group_name="default"): - col.init_collective_group(world_size, rank, backend, group_name) - return True - - def set_buffer(self, - size, - value0=1.0, - value1=1.0, - dtype=cp.float32, - tensor_type0="cupy", - 
tensor_type1="cupy"): - if tensor_type0 == "cupy": - with cp.cuda.Device(0): - self.buffer0 = cp.ones(size, dtype=dtype) * value0 - elif tensor_type0 == "torch": - self.buffer0 = torch.ones( - size, dtype=torch.float32).cuda(0) * value0 - else: - raise RuntimeError() - - if tensor_type1 == "cupy": - with cp.cuda.Device(1): - self.buffer1 = cp.ones(size, dtype=dtype) * value1 - elif tensor_type1 == "torch": - self.buffer1 = torch.ones( - size, dtype=torch.float32).cuda(1) * value1 - else: - raise RuntimeError() - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - # cp.cuda.Stream.null.synchronize() - return True - - def set_list_buffer(self, - size, - value0=1.0, - value1=1.0, - dtype=cp.float32, - tensor_type0="cupy", - tensor_type1="cupy"): - if tensor_type0 == "cupy": - with cp.cuda.Device(0): - self.list_buffer0 = [ - cp.ones(size, dtype=dtype) * value0 for _ in range(4) - ] - elif tensor_type0 == "torch": - self.list_buffer0 = [ - torch.ones(size, dtype=torch.float32).cuda(0) * value0 - for _ in range(4) - ] - else: - raise RuntimeError() - - if tensor_type1 == "cupy": - with cp.cuda.Device(1): - self.list_buffer1 = [ - cp.ones(size, dtype=dtype) * value1 for _ in range(4) - ] - elif tensor_type1 == "torch": - self.list_buffer1 = [ - torch.ones(size, dtype=torch.float32).cuda(1) * value1 - for _ in range(4) - ] - else: - raise RuntimeError() - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return True - - @ray.method(num_returns=2) - def get_buffer(self): - return self.buffer0, self.buffer1 - - def do_allreduce_multigpu(self, group_name="default", op=ReduceOp.SUM): - col.allreduce_multigpu([self.buffer0, self.buffer1], group_name, op) - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.buffer0 - - def do_reduce_multigpu(self, - group_name="default", - dst_rank=0, - dst_gpu_index=0, - op=ReduceOp.SUM): - col.reduce_multigpu([self.buffer0, self.buffer1], dst_rank, - dst_gpu_index, group_name, op) 
- cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.buffer0, self.buffer1 - - def do_broadcast_multigpu(self, - group_name="default", - src_rank=0, - src_gpu_index=0): - col.broadcast_multigpu([self.buffer0, self.buffer1], src_rank, - src_gpu_index, group_name) - return self.buffer0, self.buffer1 - - def do_allgather_multigpu(self, group_name="default"): - col.allgather_multigpu([self.list_buffer0, self.list_buffer1], - [self.buffer0, self.buffer1], group_name) - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.list_buffer0, self.list_buffer1 - - def do_reducescatter_multigpu(self, group_name="default", op=ReduceOp.SUM): - col.reducescatter_multigpu([self.buffer0, self.buffer1], - [self.list_buffer0, self.list_buffer1], - group_name, op) - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.buffer0, self.buffer1 - - def do_send_multigpu(self, - group_name="default", - dst_rank=0, - dst_gpu_index=0, - src_gpu_index=0): - if src_gpu_index == 0: - col.send_multigpu(self.buffer0, dst_rank, dst_gpu_index, - group_name) - cp.cuda.Device(0).synchronize() - return self.buffer0 - elif src_gpu_index == 1: - col.send_multigpu(self.buffer1, dst_rank, dst_gpu_index, - group_name) - cp.cuda.Device(1).synchronize() - return self.buffer1 - else: - raise RuntimeError() - - def do_recv_multigpu(self, - group_name="default", - src_rank=0, - src_gpu_index=0, - dst_gpu_index=0): - if dst_gpu_index == 0: - col.recv_multigpu(self.buffer0, src_rank, src_gpu_index, - group_name) - cp.cuda.Device(0).synchronize() - return self.buffer0 - elif dst_gpu_index == 1: - col.recv_multigpu(self.buffer1, src_rank, src_gpu_index, - group_name) - cp.cuda.Device(1).synchronize() - return self.buffer1 - else: - raise RuntimeError() - - def destroy_group(self, group_name="default"): - col.destroy_collective_group(group_name) - return True - - def report_rank(self, group_name="default"): - rank = 
col.get_rank(group_name) - return rank - - def report_world_size(self, group_name="default"): - ws = col.get_world_size(group_name) - return ws - - def report_nccl_availability(self): - avail = col.nccl_available() - return avail - - def report_gloo_availability(self): - avail = col.gloo_available() - return avail - - def report_is_group_initialized(self, group_name="default"): - is_init = col.is_group_initialized(group_name) - return is_init - - def report_num_gpus(self): - n_gpus = get_num_gpus() - return n_gpus - - -def create_collective_multigpu_workers(num_workers=2, - group_name="default", - backend="nccl"): - actors = [None] * num_workers - for i in range(num_workers): - actor = MultiGPUWorker.remote() - ray.get([actor.set_buffer.remote([10])], timeout=10) - ray.get([actor.set_list_buffer.remote([10])], timeout=10) - actors[i] = actor - world_size = num_workers - init_results = ray.get([ - actor.init_group.remote(world_size, i, backend, group_name) - for i, actor in enumerate(actors) - ]) - return actors, init_results - - -def init_tensors_for_gather_scatter_multigpu(actors, - array_size=10, - tensor_backend="cupy"): - for i, a in enumerate(actors): - if tensor_backend == "cupy": - ray.get([a.set_buffer.remote(array_size)]) - ray.get([a.set_list_buffer.remote(array_size)]) - elif tensor_backend == "torch": - ray.get([ - a.set_buffer.remote( - array_size, tensor_type0="torch", tensor_type1="torch") - ]) - ray.get([ - a.set_list_buffer.remote( - array_size, tensor_type0="torch", tensor_type1="torch") - ]) - else: - raise RuntimeError("Unsupported tensor backend.") diff --git a/python/ray/util/collective/types.py b/python/ray/util/collective/types.py index d3e964486f77..c12dde84cb6a 100644 --- a/python/ray/util/collective/types.py +++ b/python/ray/util/collective/types.py @@ -30,7 +30,6 @@ class Backend(object): """A class to represent different backends.""" NCCL = "nccl" MPI = "mpi" - GLOO = "gloo" UNRECOGNIZED = "unrecognized" def __new__(cls, name: str): @@ 
-39,8 +38,6 @@ def __new__(cls, name: str): raise ValueError("Unrecognized backend: '{}'. " "Only NCCL is supported".format(name)) if backend == Backend.MPI: - raise RuntimeError("Ray does not support MPI backend.") - if backend == Backend.GLOO: raise NotImplementedError() return backend @@ -70,7 +67,6 @@ class BarrierOptions: class ReduceOptions: reduceOp = ReduceOp.SUM root_rank = 0 - root_tensor = 0 # index for multi-gpu reduce operations timeout_ms = unset_timeout_ms @@ -89,7 +85,6 @@ class AllGatherOptions: @dataclass class BroadcastOptions: root_rank = 0 - root_tensor = 0 timeout_ms = unset_timeout_ms @@ -97,17 +92,3 @@ class BroadcastOptions: class ReduceScatterOptions: reduceOp = ReduceOp.SUM timeout_ms = unset_timeout_ms - - -@dataclass -class SendOptions: - dst_rank = 0 - dst_gpu_index = 0 - timeout_ms = unset_timeout_ms - - -@dataclass -class RecvOptions: - src_rank = 0 - src_gpu_index = 0 - unset_timeout_ms = unset_timeout_ms From 16d82414652316ade7e6ccde3c931f3df69b03b5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 199/244] Revert "Scalability Envelope Tests (#13464)" This reverts commit eeb33abf3dc1d776e7778160c9da3c5f284c1e59. 
--- benchmarks/README.md | 35 ---- benchmarks/distributed/config.yaml | 58 ------ benchmarks/distributed/test_distributed.py | 204 ------------------- benchmarks/object_store/config.yaml | 48 ----- benchmarks/object_store/test_object_store.py | 61 ------ benchmarks/single_node/config.yaml | 41 ---- benchmarks/single_node/test_single_node.py | 175 ---------------- release/RELEASE_PROCESS.rst | 8 +- 8 files changed, 1 insertion(+), 629 deletions(-) delete mode 100644 benchmarks/README.md delete mode 100644 benchmarks/distributed/config.yaml delete mode 100644 benchmarks/distributed/test_distributed.py delete mode 100644 benchmarks/object_store/config.yaml delete mode 100644 benchmarks/object_store/test_object_store.py delete mode 100644 benchmarks/single_node/config.yaml delete mode 100644 benchmarks/single_node/test_single_node.py diff --git a/benchmarks/README.md b/benchmarks/README.md deleted file mode 100644 index 2167151656a9..000000000000 --- a/benchmarks/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Ray Scalability Envelope - -### Note: This document is a WIP. This is not a scalability guarantee (yet). - -## Distributed Benchmarks - -All distributed tests are run on 64 nodes with 64 cores/node. Maximum number of nodes is achieved by adding 4 core nodes. - -| Dimension | Quantity | -| --------- | -------- | -| # nodes in cluster (with trivial task workload) | 250+ | -| # actors in cluster (with trivial workload) | 10k+ | -| # simultaneously running tasks | 10k+ | -| # simultaneously running placement groups | 1k+ | - -## Object Store Benchmarks - -| Dimension | Quantity | -| --------- | -------- | -| 1 GiB object broadcast (# of nodes) | 50+ | - - -## Single Node Benchmarks. - -All single node benchmarks are run on a single m4.16xlarge. 
- -| Dimension | Quantity | -| --------- | -------- | -| # of object artuments to a single task | 10000+ | -| # of objects returned from a single task | 3000+ | -| # of plasma objects in a single `ray.get` call | 10000+ | -| # of tasks queued on a single node | 1,000,000+ | -| Maximum `ray.get` numpy object size | 100GiB+ | - - diff --git a/benchmarks/distributed/config.yaml b/benchmarks/distributed/config.yaml deleted file mode 100644 index 630de0eef265..000000000000 --- a/benchmarks/distributed/config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -cluster_name: distributed-benchmarks -min_workers: 0 -max_workers: 999999 - -upscaling_speed: 9999999 - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a, us-west-2b, us-west-2c, us-west-2d - -auth: - ssh_user: ubuntu - -available_node_types: - head_node: - node_config: - InstanceType: m5.16xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - small: 1 - max_workers: 999999 - worker_node: - node_config: - InstanceType: m5.16xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - min_workers: 63 - max_workers: 63 - small_worker_node: - node_config: - InstanceType: m5.xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - -head_node_type: head_node - -worker_default_node_type: worker_node - -setup_commands: - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - - pip install tqdm - - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;' - -idle_timeout_minutes: 1 - -head_start_ray_commands: - - ray stop - - ulimit -n 65535; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml - -# Command to start ray on worker nodes. You don't need to change this. 
-worker_start_ray_commands: - - ray stop - - ulimit -n 65535; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/benchmarks/distributed/test_distributed.py b/benchmarks/distributed/test_distributed.py deleted file mode 100644 index c929cdba8c1a..000000000000 --- a/benchmarks/distributed/test_distributed.py +++ /dev/null @@ -1,204 +0,0 @@ -import ray -import ray.autoscaler.sdk -from ray.test_utils import Semaphore -from ray.util.placement_group import placement_group, remove_placement_group - -from time import sleep, perf_counter -from tqdm import tqdm, trange - -TEST_NUM_NODES = 64 -MAX_ACTORS_IN_CLUSTER = 10000 -MAX_RUNNING_TASKS_IN_CLUSTER = 10000 -MAX_PLACEMENT_GROUPS = 1000 -MAX_NUM_NODES = 250 - - -def num_alive_nodes(): - n = 0 - for node in ray.nodes(): - if node["Alive"]: - n += 1 - return n - - -def scale_to(target): - while num_alive_nodes() != target: - ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target) - print(f"Current # nodes: {num_alive_nodes()}, target: {target}") - print("Waiting ...") - sleep(5) - - -def test_nodes(): - scale_to(MAX_NUM_NODES) - assert num_alive_nodes() == MAX_NUM_NODES - # Treat this as a trivial task to ensure the nodes are all functioning - test_max_running_tasks() - - -def test_max_actors(): - # TODO (Alex): Dynamically set this based on number of cores - cpus_per_actor = 0.25 - - @ray.remote(num_cpus=cpus_per_actor) - class Actor: - def foo(self): - pass - - actors = [ - Actor.remote() - for _ in trange(MAX_ACTORS_IN_CLUSTER, desc="Launching actors") - ] - - for actor in tqdm(actors, desc="Ensuring actors have started"): - assert ray.get(actor.foo.remote()) is None - - -def test_max_running_tasks(): - counter = Semaphore.remote(0) - blocker = Semaphore.remote(0) - - @ray.remote(num_cpus=0.25) - def task(counter, blocker): - sleep(300) - - refs = [ - task.remote(counter, blocker) - for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER, desc="Launching tasks") - ] - - max_cpus = 
ray.cluster_resources()["CPU"] - min_cpus_available = max_cpus - for _ in trange(int(300 / 0.1), desc="Waiting"): - try: - cur_cpus = ray.available_resources().get("CPU", 0) - min_cpus_available = min(min_cpus_available, cur_cpus) - except Exception: - # There are race conditions `.get` can fail if a new heartbeat - # comes at the same time. - pass - sleep(0.1) - - # There are some relevant magic numbers in this check. 10k tasks each - # require 1/4 cpus. Therefore, ideally 2.5k cpus will be used. - err_str = f"Only {max_cpus - min_cpus_available}/{max_cpus} cpus used." - assert max_cpus - min_cpus_available > 2000, err_str - - for _ in trange( - MAX_RUNNING_TASKS_IN_CLUSTER, - desc="Ensuring all tasks have finished"): - done, refs = ray.wait(refs) - assert ray.get(done[0]) is None - - -def test_many_placement_groups(): - @ray.remote(num_cpus=1, resources={"node": 0.02}) - def f1(): - sleep(10) - pass - - @ray.remote(num_cpus=1) - def f2(): - sleep(10) - pass - - @ray.remote(resources={"node": 0.02}) - def f3(): - sleep(10) - pass - - bundle1 = {"node": 0.02, "CPU": 1} - bundle2 = {"CPU": 1} - bundle3 = {"node": 0.02} - - pgs = [] - for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"): - pg = placement_group(bundles=[bundle1, bundle2, bundle3]) - pgs.append(pg) - - for pg in tqdm(pgs, desc="Waiting for pgs to be ready"): - ray.get(pg.ready()) - - refs = [] - for pg in tqdm(pgs, desc="Scheduling tasks"): - ref1 = f1.options(placement_group=pg).remote() - ref2 = f2.options(placement_group=pg).remote() - ref3 = f3.options(placement_group=pg).remote() - refs.extend([ref1, ref2, ref3]) - - for _ in trange(10, desc="Waiting"): - sleep(1) - - with tqdm() as p_bar: - while refs: - done, refs = ray.wait(refs) - p_bar.update() - - for pg in tqdm(pgs, desc="Cleaning up pgs"): - remove_placement_group(pg) - - -ray.init(address="auto") - -scale_to(TEST_NUM_NODES) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) - 
-cluster_resources = ray.cluster_resources() - -available_resources = ray.available_resources() -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done launching nodes") - -actor_start = perf_counter() -test_max_actors() -actor_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done testing actors") - -task_start = perf_counter() -test_max_running_tasks() -task_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done testing tasks") - -pg_start = perf_counter() -test_many_placement_groups() -pg_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done testing placement groups") - -launch_start = perf_counter() -test_nodes() -launch_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == MAX_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -print("Done.") - -actor_time = actor_end - actor_start -task_time = task_end - task_start -pg_time = pg_end - pg_start -launch_time = launch_end - launch_start - -print(f"Actor time: {actor_time} ({MAX_ACTORS_IN_CLUSTER} actors)") -print(f"Task time: {task_time} ({MAX_RUNNING_TASKS_IN_CLUSTER} tasks)") -print(f"PG time: {pg_time} ({MAX_PLACEMENT_GROUPS} placement groups)") -print(f"Node launch time: {launch_time} ({MAX_NUM_NODES} nodes)") diff --git a/benchmarks/object_store/config.yaml b/benchmarks/object_store/config.yaml deleted 
file mode 100644 index 5ea3ce8352af..000000000000 --- a/benchmarks/object_store/config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -cluster_name: object-store-benchmarks -min_workers: 0 -max_workers: 999999 - -upscaling_speed: 9999999 - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - -auth: - ssh_user: ubuntu - -available_node_types: - head_node: - node_config: - InstanceType: m4.4xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - worker_node: - node_config: - InstanceType: m4.xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - -head_node_type: head_node - -worker_default_node_type: worker_node - -setup_commands: - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - - pip install tqdm numpy - -idle_timeout_minutes: 5 - -head_start_ray_commands: - - ray stop - - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml - -# Command to start ray on worker nodes. You don't need to change this. 
-worker_start_ray_commands: - - ray stop - - ulimit -n 1000000; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/benchmarks/object_store/test_object_store.py b/benchmarks/object_store/test_object_store.py deleted file mode 100644 index 83312fddd90e..000000000000 --- a/benchmarks/object_store/test_object_store.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np - -import ray -import ray.autoscaler.sdk - -from time import sleep, perf_counter -from tqdm import tqdm - -NUM_NODES = 50 -OBJECT_SIZE = 2**30 - - -def num_alive_nodes(): - n = 0 - for node in ray.nodes(): - if node["Alive"]: - n += 1 - return n - - -def scale_to(target): - while num_alive_nodes() != target: - ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target) - print(f"Current # nodes: {num_alive_nodes()}, target: {target}") - print("Waiting ...") - sleep(5) - - -def test_object_broadcast(): - scale_to(NUM_NODES) - - @ray.remote(num_cpus=1, resources={"node": 1}) - class Actor: - def foo(self): - pass - - def sum(self, arr): - return np.sum(arr) - - actors = [Actor.remote() for _ in range(NUM_NODES)] - - arr = np.ones(OBJECT_SIZE, dtype=np.uint8) - ref = ray.put(arr) - - for actor in tqdm(actors, desc="Ensure all actors have started."): - ray.get(actor.foo.remote()) - - result_refs = [] - for actor in tqdm(actors, desc="Broadcasting objects"): - result_refs.append(actor.sum.remote(ref)) - - results = ray.get(result_refs) - for result in results: - assert result == OBJECT_SIZE - - -ray.init(address="auto") -start = perf_counter() -test_object_broadcast() -end = perf_counter() -print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)") diff --git a/benchmarks/single_node/config.yaml b/benchmarks/single_node/config.yaml deleted file mode 100644 index e5798541f9c1..000000000000 --- a/benchmarks/single_node/config.yaml +++ /dev/null @@ -1,41 +0,0 @@ -cluster_name: single-node-benchmarks -min_workers: 0 -max_workers: 0 - -upscaling_speed: 9999999 
- -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - -auth: - ssh_user: ubuntu - -available_node_types: - head_node: - node_config: - InstanceType: m4.16xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - worker_node: - node_config: - InstanceType: m4.xlarge - ImageId: ami-098555c9b343eb09c - -head_node_type: head_node - -worker_default_node_type: worker_node - -setup_commands: - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - - pip install numpy tqdm - - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1000000" >> /etc/security/limits.conf; echo "* hard nofile 1000000" >> /etc/security/limits.conf;' - -idle_timeout_minutes: 5 - -head_start_ray_commands: - - ray stop - - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --object-store-memory=128000000000 --autoscaling-config=~/ray_bootstrap_config.yaml diff --git a/benchmarks/single_node/test_single_node.py b/benchmarks/single_node/test_single_node.py deleted file mode 100644 index 75d783124523..000000000000 --- a/benchmarks/single_node/test_single_node.py +++ /dev/null @@ -1,175 +0,0 @@ -import numpy as np -import ray -import ray.autoscaler.sdk -from ray.test_utils import Semaphore - -from time import perf_counter -from tqdm import trange, tqdm - -MAX_ARGS = 10000 -MAX_RETURNS = 3000 -MAX_RAY_GET_ARGS = 10000 -MAX_QUEUED_TASKS = 1_000_000 -MAX_RAY_GET_SIZE = 100 * 2**30 - - -def test_many_args(): - @ray.remote - def sum_args(*args): - return sum(sum(arg) for arg in args) - - args = [[1 for _ in range(10000)] for _ in range(MAX_ARGS)] - result = ray.get(sum_args.remote(*args)) - assert result == MAX_ARGS * 10000 - - -def test_many_returns(): - @ray.remote(num_returns=MAX_RETURNS) - def f(): - to_return = [] - for _ in range(MAX_RETURNS): - obj = list(range(10000)) - to_return.append(obj) - - return tuple(to_return) - - returned_refs = 
f.remote() - assert len(returned_refs) == MAX_RETURNS - - for ref in returned_refs: - expected = list(range(10000)) - obj = ray.get(ref) - assert obj == expected - - -def test_ray_get_args(): - def with_dese(): - print("Putting test objects:") - refs = [] - for _ in trange(MAX_RAY_GET_ARGS): - obj = list(range(10000)) - refs.append(ray.put(obj)) - - print("Getting objects") - results = ray.get(refs) - assert len(results) == MAX_RAY_GET_ARGS - - print("Asserting correctness") - for obj in tqdm(results): - expected = list(range(10000)) - assert obj == expected - - def with_zero_copy(): - print("Putting test objects:") - refs = [] - for _ in trange(MAX_RAY_GET_ARGS): - obj = np.arange(10000) - refs.append(ray.put(obj)) - - print("Getting objects") - results = ray.get(refs) - assert len(results) == MAX_RAY_GET_ARGS - - print("Asserting correctness") - for obj in tqdm(results): - expected = np.arange(10000) - assert (obj == expected).all() - - with_dese() - print("Done with dese") - with_zero_copy() - print("Done with zero copy") - - -def test_many_queued_tasks(): - sema = Semaphore.remote(0) - - @ray.remote(num_cpus=1) - def block(): - ray.get(sema.acquire.remote()) - - @ray.remote(num_cpus=1) - def f(): - pass - - num_cpus = int(ray.cluster_resources()["CPU"]) - blocked_tasks = [] - for _ in range(num_cpus): - blocked_tasks.append(block.remote()) - - print("Submitting many tasks") - pending_tasks = [] - for _ in trange(MAX_QUEUED_TASKS): - pending_tasks.append(f.remote()) - - # Make sure all the tasks can actually run. 
- for _ in range(num_cpus): - sema.release.remote() - - print("Unblocking tasks") - for ref in tqdm(pending_tasks): - assert ray.get(ref) is None - - -def test_large_object(): - print("Generating object") - obj = np.zeros(MAX_RAY_GET_SIZE, dtype=np.int8) - print("Putting object") - ref = ray.put(obj) - del obj - print("Getting object") - big_obj = ray.get(ref) - - assert big_obj[0] == 0 - assert big_obj[-1] == 0 - - -ray.init(address="auto") - -args_start = perf_counter() -test_many_args() -args_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished many args") - -returns_start = perf_counter() -test_many_returns() -returns_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished many returns") - -get_start = perf_counter() -test_ray_get_args() -get_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished ray.get on many objects") - -queued_start = perf_counter() -test_many_queued_tasks() -queued_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished queueing many tasks") - -large_object_start = perf_counter() -test_large_object() -large_object_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Done") - -args_time = args_end - args_start -returns_time = returns_end - returns_start -get_time = get_end - get_start -queued_time = queued_end - queued_start -large_object_time = large_object_end - large_object_start - -print(f"Many args time: {args_time} ({MAX_ARGS} args)") -print(f"Many returns time: {returns_time} ({MAX_RETURNS} returns)") -print(f"Ray.get time: {get_time} ({MAX_RAY_GET_ARGS} args)") -print(f"Queued task time: {queued_time} ({MAX_QUEUED_TASKS} tasks)") -print(f"Ray.get large object time: {large_object_time} " - f"({MAX_RAY_GET_SIZE} bytes)") diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index 
018f56bdf941..c60e1c4aa789 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -134,13 +134,7 @@ is generally the easiest way to run release tests. The summaries printed by each test should be checked in under ``release_logs/`` on the **master** branch (make a pull request). -5. **Scalability envelope tests** - - - Run the tests in `benchmarks/` (with `ray submit --start cluster.yaml `) - - Record the outputted times. - - Whether the results are acceptable is a judgement call. - -6. **ASAN tests** +5. **ASAN tests** Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the whole Python tests to detect memory leaks. From 500b3a99c2f7f5e13a4f173f85a382ac902204e7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 200/244] Revert "[CI] Remove object_manager_test (#13703)" This reverts commit 542e524064d78c9e5a75dc02f0c71d5abb92b92f. --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ebfd96322ecf..0544234af182 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,5 +1,6 @@ - label: ":cpp: Tests" commands: + - bash src/ray/test/run_object_manager_tests.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -core_worker_test From 45cf7fde398dd354a5e1f1959b6e5912a527f5be Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 201/244] Revert "Add K8s test to release process (#13694)" This reverts commit 4ea837e7210deaabf8eb8d6d1ce85832d966d81d. 
--- release/RELEASE_CHECKLIST.md | 4 ---- release/RELEASE_PROCESS.rst | 7 ++----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/release/RELEASE_CHECKLIST.md b/release/RELEASE_CHECKLIST.md index 9ab85f30bac0..50b30f8ff54c 100644 --- a/release/RELEASE_CHECKLIST.md +++ b/release/RELEASE_CHECKLIST.md @@ -56,10 +56,6 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] Results added to `release/release_logs` - [ ] stress_tests - [ ] unit_gpu_tests -- [ ] ASAN Test -- [ ] K8s Test - - [ ] K8s cluster launcher test - - [ ] K8s operator test ## Final Steps - [ ] Wheels uploaded to Test PyPI diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index c60e1c4aa789..287ba870c661 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -136,11 +136,8 @@ is generally the easiest way to run release tests. 5. **ASAN tests** - Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the whole Python tests to detect memory leaks. - -6. **K8s operator tests** - - Run the ``python/ray/tests/test_k8s_*`` to make sure K8s cluster launcher and operator works. Make sure the docker image is the released version. + Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the + whole Python tests to detect memory leaks. Identify and Resolve Release Blockers ------------------------------------- From 82c3f2f3bb16897d3929fa78242ed843f5415da9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 202/244] Revert "[Buildkite] Add all Python tests (#13566)" This reverts commit e20509506c7d25e1a38b4cb3e658413f1fe90535. 
--- .bazelrc | 1 - .buildkite/Dockerfile | 12 +- .buildkite/pipeline.yml | 143 +----------------- ci/travis/install-dependencies.sh | 29 +--- python/ray/scripts/scripts.py | 6 +- python/ray/tests/test_stress.py | 2 +- python/ray/tests/test_stress_failure.py | 2 +- python/ray/tests/test_stress_sharded.py | 2 +- .../tests/test_unreconstructable_errors.py | 2 +- 9 files changed, 16 insertions(+), 183 deletions(-) diff --git a/.bazelrc b/.bazelrc index 8de20992a595..2baaa0fa2af5 100644 --- a/.bazelrc +++ b/.bazelrc @@ -95,7 +95,6 @@ test:asan --test_env=ASAN_OPTIONS="detect_leaks=0" test:asan --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.2 /usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" # For example, for Ubuntu 18.04 libasan can be found here: # test:asan --test_env=LD_PRELOAD="/usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" -test:asan-buildkite --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.5" # CI configuration: aquery:ci --color=no diff --git a/.buildkite/Dockerfile b/.buildkite/Dockerfile index 86bd28148985..2f52fb92d1d1 100644 --- a/.buildkite/Dockerfile +++ b/.buildkite/Dockerfile @@ -5,25 +5,15 @@ ARG BUILDKITE_PULL_REQUEST ENV DEBIAN_FRONTEND=noninteractive ENV TZ=America/Los_Angeles - ENV BUILDKITE=true ENV CI=true ENV PYTHON=3.6 -ENV RAY_USE_RANDOM_PORTS=1 -ENV RAY_DEFAULT_BUILD=1 RUN apt-get update -qq RUN apt-get install -y -qq \ curl python-is-python3 git build-essential \ - sudo unzip apt-utils dialog tzdata wget rsync \ - language-pack-en tmux cmake gdb vim htop \ - libgtk2.0-dev zlib1g-dev libgl1-mesa-dev - -# System conf for tests + sudo unzip apt-utils dialog tzdata wget RUN locale -a -ENV LC_ALL=en_US.utf8 -ENV LANG=en_US.utf8 -RUN echo "ulimit -c 0" >> /root/.bashrc # Setup Bazel caches RUN (echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc); \ diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0544234af182..91c673d52604 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,141 
+1,6 @@ -- label: ":cpp: Tests" +- label: "Ray Core Tests (:buildkite: Experimental)" commands: - - bash src/ray/test/run_object_manager_tests.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - -- //:all -rllib/... -core_worker_test - -- label: ":cpp: Tests (ASAN)" + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... +- label: "Ray Dashboard Tests" commands: - - bazel test --config=ci --config=asan $(./scripts/bazel_export_options) - --build_tests_only - --config=asan-buildkite - --jobs=2 - -- //:all -//:core_worker_test - -- label: ":serverless: Dashboard + Serve Tests" - commands: - - TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - python/ray/new_dashboard/... - - bazel test --config=ci $(./scripts/bazel_export_options) - python/ray/serve/... - -- label: ":python: (Small & Large)" - commands: - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,-medium_size_python_tests_a_to_j,-medium_size_python_tests_k_to_z - python/ray/tests/... - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,client_tests - --test_env=RAY_CLIENT_MODE=1 - python/ray/tests/... -- label: ":python: (Medium A-J)" - commands: - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_a_to_j - python/ray/tests/... -- label: ":python: (Medium K-Z)" - commands: - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_k_to_z - python/ray/tests/... 
- -- label: ":brain: RLlib: Learning tests (from rllib/tuned_examples/*.yaml)" - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=learning_tests_tf - rllib/... - -- label: ":brain: RLlib: Learning tests with tf=1.x (from rllib/tuned_examples/*.yaml)" - commands: - - RLLIB_TESTING=1 TF_VERSION=1.14.0 TFP_VERSION=0.7 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=learning_tests_tf - rllib/... - -- label: ":brain: RLlib: Learning tests with Torch (from rllib/tuned_examples/*.yaml)" - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=learning_tests_torch - rllib/... - -- label: ":brain: RLlib: Quick Agent train.py runs" - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=quick_train - --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... - # Test everything that does not have any of the "main" labels: - # "learning_tests|quick_train|examples|tests_dir". - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir - --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... 
- -- label: ":brain: RLlib: rllib/examples/" - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... - -- label: ":brain: RLlib: rllib/tests/ (A-L)" - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... 
- -- label: ":brain: RLlib: rllib/tests/ (M-Z)" - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... - - -- label: ":octopus: Tune tests and examples" - commands: - - TUNE_TESTING=1 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only,-example python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=example,-tf,-pytorch,-py37,-flaky python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37,-flaky python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37,-flaky python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-py37,flaky python/ray/tune/... - -- label: ":octopus: SGD tests and examples" - commands: - - SGD_TESTING=1 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/... - -- label: ":octopus: Tune/SGD tests and examples. 
Python 3.7" - commands: - - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh - # Bcause Python version changed, we need to re-install Ray here - - rm -rf ./python/ray/thirdparty_files; ./ci/travis/ci.sh build - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... - -- label: ":book: Doc tests and examples" - commands: - - DOC_TESTING=1 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 doc/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 doc/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 doc/... \ No newline at end of file + - bazel test --config=ci $(./scripts/bazel_export_options) python/ray/new_dashboard/... diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index 498aaf419533..96f4fa95a8f2 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -23,13 +23,6 @@ pkg_install_helper() { } install_bazel() { - if command -v bazel; then - if [ -n "${BUILDKITE-}" ]; then - echo "Bazel exists, skipping the install" - return - fi - fi - "${ROOT_DIR}"/install-bazel.sh if [ -f /etc/profile.d/bazel.sh ]; then . /etc/profile.d/bazel.sh @@ -37,11 +30,6 @@ install_bazel() { } install_base() { - if [ -n "${BUILDKITE-}" ]; then - echo "Skipping install_base in Buildkite" - return - fi - case "${OSTYPE}" in linux*) # Expired apt key error: https://github.com/bazelbuild/bazel/issues/11470#issuecomment-633205152 @@ -200,7 +188,9 @@ install_nvm() { > "${NVM_HOME}/nvm.sh" fi elif [ -n "${BUILDKITE-}" ]; then - echo "Skipping nvm on Buildkite because we will use apt-get." 
+ # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions + curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - + sudo apt-get install -y nodejs else test -f "${NVM_HOME}/nvm.sh" # double-check NVM is already available on other platforms fi @@ -226,19 +216,10 @@ install_upgrade_pip() { } install_node() { - if command -v node; then - if [ -n "${BUILDKITE-}" ]; then - echo "Node existed, skipping install"; - return - fi - fi - if [ "${OSTYPE}" = msys ] ; then { echo "WARNING: Skipping running Node.js due to incompatibilities with Windows"; } 2> /dev/null elif [ -n "${BUILDKITE-}" ] ; then - # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions - curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - - sudo apt-get install -y nodejs + { echo "WARNING: Skipping running Node.js on buildkite because it's already there"; } 2> /dev/null else # Install the latest version of Node.js in order to build the dashboard. ( @@ -277,7 +258,7 @@ install_dependencies() { if [ -n "${PYTHON-}" ]; then # Remove this entire section once RLlib and Serve dependencies are fixed. - if [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then + if [ -z "${BUILDKITE-}" ] && [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then # PyTorch is installed first since we are using a "-f" directive to find the wheels. # We want to install the CPU version only. 
local torch_url="https://download.pytorch.org/whl/torch_stable.html" diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index b61c6939984c..6fecd2dc272b 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -739,7 +739,6 @@ def stop(force, verbose, log_style, log_color): total_found = 0 total_stopped = 0 - stopped = [] for keyword, filter_by_cmd in processes_to_kill: if filter_by_cmd and is_linux and len(keyword) > 15: # getting here is an internal bug, so we do not use cli_logger @@ -778,7 +777,6 @@ def stop(force, verbose, log_style, log_color): cf.dimmed("(via SIGTERM)")) total_stopped += 1 - stopped.append(proc) except psutil.NoSuchProcess: cli_logger.verbose( "Attempted to stop `{}`, but process was already dead.", @@ -801,8 +799,8 @@ def stop(force, verbose, log_style, log_color): cli_logger.warning("Try running the command again, or use `{}`.", cf.bold("--force")) - # Wait for the processes to actually stop. - psutil.wait_procs(stopped, timeout=2) + # TODO(maximsmol): we should probably block until the processes actually + # all died somehow @cli.command() diff --git a/python/ray/tests/test_stress.py b/python/ray/tests/test_stress.py index 99ed186716e2..2007887367ef 100644 --- a/python/ray/tests/test_stress.py +++ b/python/ray/tests/test_stress.py @@ -15,7 +15,7 @@ def ray_start_combination(request): initialize_head=True, head_node_args={ "num_cpus": 10, - "redis_max_memory": 10**8 + "redis_max_memory": 10**7 }) for i in range(num_nodes - 1): cluster.add_node(num_cpus=10) diff --git a/python/ray/tests/test_stress_failure.py b/python/ray/tests/test_stress_failure.py index 83d9f40f24ed..01d39afa8065 100644 --- a/python/ray/tests/test_stress_failure.py +++ b/python/ray/tests/test_stress_failure.py @@ -20,7 +20,7 @@ def ray_start_reconstruction(request): head_node_args={ "num_cpus": 1, "object_store_memory": plasma_store_memory // num_nodes, - "redis_max_memory": 10**8, + "redis_max_memory": 10**7, 
"_system_config": { "object_timeout_milliseconds": 200 } diff --git a/python/ray/tests/test_stress_sharded.py b/python/ray/tests/test_stress_sharded.py index c6e5cd484bb2..7f05f27acb37 100644 --- a/python/ray/tests/test_stress_sharded.py +++ b/python/ray/tests/test_stress_sharded.py @@ -14,7 +14,7 @@ def ray_start_sharded(request): object_store_memory=int(0.5 * 10**9), num_cpus=10, # _num_redis_shards=num_redis_shards, - _redis_max_memory=10**8) + _redis_max_memory=10**7) yield None diff --git a/python/ray/tests/test_unreconstructable_errors.py b/python/ray/tests/test_unreconstructable_errors.py index 24be89b94297..501dce905530 100644 --- a/python/ray/tests/test_unreconstructable_errors.py +++ b/python/ray/tests/test_unreconstructable_errors.py @@ -10,7 +10,7 @@ def setUp(self): ray.init( num_cpus=1, object_store_memory=150 * 1024 * 1024, - _redis_max_memory=10**8) + _redis_max_memory=10000000) def tearDown(self): ray.shutdown() From 72260846a7a14632010f1539d9dee49a47a585db Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 203/244] Revert "[tune](deps): Bump distributed in /python/requirements (#13643)" This reverts commit b3443195e4836a41777da292729b017bb1ca51a4. 
--- python/requirements/linux-py3.6-requirements_tune.txt | 2 +- python/requirements/linux-py3.7-requirements_tune.txt | 2 +- python/requirements/linux-py3.8-requirements_tune.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index bae7f20ae363..4351d0b6386f 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -155,7 +155,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.1 +distributed==2021.1.0 # via # autogluon.core # dask diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index bb10df777068..c7a7b9204649 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -148,7 +148,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.1 +distributed==2021.1.0 # via # autogluon.core # dask diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt index 8ef61bd51b63..195951424490 100644 --- a/python/requirements/linux-py3.8-requirements_tune.txt +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -146,7 +146,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.1 +distributed==2021.1.0 # via # autogluon.core # dask From 343a0f380583d871ccb3bd754deac9d1fa455fc7 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 204/244] Revert "skip test_spill (#13693)" This reverts commit e2d7ab9e4fdfd0622da3846d2eb6e7e387760f13. 
--- python/ray/tests/test_object_spilling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index a80a91580c6f..68824b7bb09a 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -214,7 +214,7 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): @pytest.mark.skipif( - platform.system() in ["Darwin", "Windows"], reason="Failing on Windows.") + platform.system() == "Windows", reason="Failing on Windows.") def test_spill_stats(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, _ = object_spilling_config From fa940a0c9e0dc5df4342bc93c20a089f360cad13 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 205/244] Revert "Revert "Revert "[dashboard] Fix RAY_RAYLET_PID KeyError on Windows (#12948)" (#13572)" (#13685)" This reverts commit b337a60f81fa4ef2fcc82a48a651a0c89b83064e. --- dashboard/agent.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index 7bf5e1551a2b..f1c496b89004 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -62,13 +62,9 @@ def __init__(self, self.object_store_name = object_store_name self.raylet_name = raylet_name self.node_id = os.environ["RAY_NODE_ID"] - # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is - # only used for fate-sharing with the raylet and we need a different - # fate-sharing mechanism for Windows anyways. 
- if sys.platform not in ["win32", "cygwin"]: - self.ppid = int(os.environ["RAY_RAYLET_PID"]) - assert self.ppid > 0 - logger.info("Parent pid is %s", self.ppid) + self.ppid = int(os.environ["RAY_RAYLET_PID"]) + assert self.ppid > 0 + logger.info("Parent pid is %s", self.ppid) self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), )) self.grpc_port = self.server.add_insecure_port( f"[::]:{self.dashboard_agent_port}") @@ -112,8 +108,7 @@ async def _check_parent(): logger.error("Failed to check parent PID, exiting.") sys.exit(1) - if sys.platform not in ["win32", "cygwin"]: - check_parent_task = create_task(_check_parent()) + check_parent_task = create_task(_check_parent()) # Create an aioredis client for all modules. try: From dcea83f3a0fb51c366d5bad23670a53a63fa2a55 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 206/244] Revert "[docs] Remove API warning from mp.Pool (#13683)" This reverts commit b401151ddef1d6a6a36790216ca9bd7f4597d3ce. --- doc/source/multiprocessing.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/source/multiprocessing.rst b/doc/source/multiprocessing.rst index 7d027b734fd9..3e3d57292b04 100644 --- a/doc/source/multiprocessing.rst +++ b/doc/source/multiprocessing.rst @@ -10,6 +10,11 @@ using `Ray Actors `__ instead of local processes. This makes it eas to scale existing applications that use ``multiprocessing.Pool`` from a single node to a cluster. +.. note:: + + This API is new and may be revised in future Ray releases. If you encounter + any bugs, please file an `issue on GitHub`_. + .. 
_`multiprocessing.Pool API`: https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool Quickstart From 9402b8a1466140f9fb7f29b5a7319a14d33ed7bc Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 207/244] Revert "[kubernetes][operator][hotfix] Dictionary fix (#13663)" This reverts commit b5c4568b7d2742bdf775218855b32da15e3e36ba. --- python/ray/operator/operator_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/ray/operator/operator_utils.py b/python/ray/operator/operator_utils.py index 5d51baebbd77..08afda94f1d4 100644 --- a/python/ray/operator/operator_utils.py +++ b/python/ray/operator/operator_utils.py @@ -95,7 +95,4 @@ def get_cluster_owner_reference( def translate(configuration: Dict[str, Any], dictionary: Dict[str, str]) -> Dict[str, Any]: - return { - dictionary[field]: configuration[field] - for field in dictionary if field in configuration - } + return {dictionary[field]: configuration[field] for field in dictionary} From 10805977711707e46f7b70b9ae3fecb80d187767 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 208/244] Revert "[Metric] Fix crashed when register metric view in multithread (#13485)" This reverts commit 18216d23a1e199ccfc28b8a0f3c9a1bf956f415e. 
--- src/ray/stats/metric.cc | 29 +++++++++++------------------ src/ray/stats/metric.h | 3 --- src/ray/stats/stats_test.cc | 32 -------------------------------- 3 files changed, 11 insertions(+), 53 deletions(-) diff --git a/src/ray/stats/metric.cc b/src/ray/stats/metric.cc index d4b253428b92..4a475a338408 100644 --- a/src/ray/stats/metric.cc +++ b/src/ray/stats/metric.cc @@ -22,8 +22,6 @@ namespace ray { namespace stats { -absl::Mutex Metric::registration_mutex_; - static void RegisterAsView(opencensus::stats::ViewDescriptor view_descriptor, const std::vector &keys) { // Register global keys. @@ -87,24 +85,19 @@ void Metric::Record(double value, const TagsType &tags) { return; } - // NOTE(lingxuan.zlx): Double check for recording performance while - // processing in multithread and avoid race since metrics may invoke - // record in different threads or code pathes. if (measure_ == nullptr) { - absl::MutexLock lock(®istration_mutex_); - if (measure_ == nullptr) { - // Measure could be registered before, so we try to get it first. - MeasureDouble registered_measure = - opencensus::stats::MeasureRegistry::GetMeasureDoubleByName(name_); - - if (registered_measure.IsValid()) { - measure_.reset(new MeasureDouble(registered_measure)); - } else { - measure_.reset( - new MeasureDouble(MeasureDouble::Register(name_, description_, unit_))); - } - RegisterView(); + // Measure could be registered before, so we try to get it first. + MeasureDouble registered_measure = + opencensus::stats::MeasureRegistry::GetMeasureDoubleByName(name_); + + if (registered_measure.IsValid()) { + measure_.reset(new MeasureDouble(registered_measure)); + } else { + measure_.reset( + new MeasureDouble(MeasureDouble::Register(name_, description_, unit_))); } + + RegisterView(); } // Do record. 
diff --git a/src/ray/stats/metric.h b/src/ray/stats/metric.h index dac50bc2d947..06e8534c4c67 100644 --- a/src/ray/stats/metric.h +++ b/src/ray/stats/metric.h @@ -129,9 +129,6 @@ class Metric { std::vector tag_keys_; std::unique_ptr> measure_; - // For making sure thread-safe to all of metric registrations. - static absl::Mutex registration_mutex_; - }; // class Metric class Gauge : public Metric { diff --git a/src/ray/stats/stats_test.cc b/src/ray/stats/stats_test.cc index 38f7952823d7..21e1627233a4 100644 --- a/src/ray/stats/stats_test.cc +++ b/src/ray/stats/stats_test.cc @@ -116,38 +116,6 @@ TEST_F(StatsTest, InitializationTest) { ASSERT_TRUE(new_first_tag.second == test_tag_value_that_shouldnt_be_applied); } -TEST(Metric, MultiThreadMetricRegisterViewTest) { - ray::stats::Shutdown(); - std::shared_ptr exporter( - new stats::StdoutExporterClient()); - ray::stats::Init({}, MetricsAgentPort, exporter); - std::vector threads; - const stats::TagKeyType tag1 = stats::TagKeyType::Register("k1"); - const stats::TagKeyType tag2 = stats::TagKeyType::Register("k2"); - for (int index = 0; index < 10; ++index) { - threads.emplace_back([tag1, tag2, index]() { - for (int i = 0; i < 100; i++) { - stats::Count random_counter( - "ray.random.counter" + std::to_string(index) + std::to_string(i), "", "", - {tag1, tag2}); - random_counter.Record(i); - stats::Gauge random_gauge( - "ray.random.gauge" + std::to_string(index) + std::to_string(i), "", "", - {tag1, tag2}); - random_gauge.Record(i); - stats::Sum random_sum( - "ray.random.sum" + std::to_string(index) + std::to_string(i), "", "", - {tag1, tag2}); - random_sum.Record(i); - } - }); - } - for (auto &thread : threads) { - thread.join(); - } - ray::stats::Shutdown(); -} - TEST_F(StatsTest, MultiThreadedInitializationTest) { // Make sure stats module is thread-safe. // Shutdown the stats module first. 
From 6999caaa4a9cad204e1f102c255c9b89fb6a455a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 209/244] Revert "[Placement Group] Move PlacementGroup public method to interface. (#13629)" This reverts commit 36bf58bfc092482ceabfb5ca56e4ab65cf0dc7c7. --- .../api/placementgroup/PlacementGroup.java | 50 +------------------ .../placementgroup/PlacementGroupImpl.java | 12 ++--- .../java/io/ray/test/PlacementGroupTest.java | 40 +++++++++------ 3 files changed, 31 insertions(+), 71 deletions(-) diff --git a/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java b/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java index 0c5b31b67889..9b4080deb988 100644 --- a/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java +++ b/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java @@ -1,57 +1,9 @@ package io.ray.api.placementgroup; -import io.ray.api.id.PlacementGroupId; -import java.util.List; -import java.util.Map; - /** * A placement group is used to place interdependent actors according to a specific strategy {@link * PlacementStrategy}. When a placement group is created, the corresponding actor slots and * resources are preallocated. A placement group consists of one or more bundles plus a specific * placement strategy. */ -public interface PlacementGroup { - - /** - * Get the id of current placement group. - * - * @return Id of current placement group. - */ - PlacementGroupId getId(); - - /** - * Get the name of current placement group. - * - * @return Name of current placement group. - */ - String getName(); - - /** - * Get all bundles which key is resource name and value is resource value. - * - * @return All bundles of current placement group. - */ - List> getBundles(); - - /** - * Get the strategy of current placement group. - * - * @return Strategy of current placement group. 
- */ - PlacementStrategy getStrategy(); - - /** - * Get the state of current placement group. - * - * @return Creation state of current placement group. - */ - PlacementGroupState getState(); - - /** - * Wait for the placement group to be ready within the specified time. - * - * @param timeoutSeconds Timeout in seconds. - * @return True if the placement group is created. False otherwise. - */ - boolean wait(int timeoutSeconds); -} +public interface PlacementGroup {} diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java index 55ca446f8423..1d0d540848bf 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java @@ -30,32 +30,32 @@ private PlacementGroupImpl( this.state = state; } - @Override public PlacementGroupId getId() { return id; } - @Override public String getName() { return name; } - @Override public List> getBundles() { return bundles; } - @Override public PlacementStrategy getStrategy() { return strategy; } - @Override public PlacementGroupState getState() { return state; } - @Override + /** + * Wait for the placement group to be ready within the specified time. + * + * @param timeoutSeconds Timeout in seconds. + * @return True if the placement group is created. False otherwise. 
+ */ public boolean wait(int timeoutSeconds) { return Ray.internal().waitPlacementGroupReady(id, timeoutSeconds); } diff --git a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java index edbd2c30e4d6..14bf0fd6a577 100644 --- a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java +++ b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java @@ -7,6 +7,7 @@ import io.ray.api.placementgroup.PlacementGroupState; import io.ray.api.placementgroup.PlacementStrategy; import io.ray.runtime.exception.RayException; +import io.ray.runtime.placementgroup.PlacementGroupImpl; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; @@ -31,7 +32,8 @@ public int getValue() { // This test just creates a placement group with one bundle. // It's not comprehensive to test all placement group test cases. public void testCreateAndCallActor() { - PlacementGroup placementGroup = PlacementGroupTestUtils.createSimpleGroup(); + PlacementGroupImpl placementGroup = + (PlacementGroupImpl) PlacementGroupTestUtils.createSimpleGroup(); Assert.assertTrue(placementGroup.wait(10)); Assert.assertEquals(placementGroup.getName(), "unnamed_group"); @@ -46,18 +48,22 @@ public void testCreateAndCallActor() { @Test(groups = {"cluster"}) public void testGetPlacementGroup() { - PlacementGroup firstPlacementGroup = - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); - - PlacementGroup secondPlacementGroup = - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); + PlacementGroupImpl firstPlacementGroup = + (PlacementGroupImpl) + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); + + PlacementGroupImpl secondPlacementGroup = + (PlacementGroupImpl) + 
PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); Assert.assertTrue(firstPlacementGroup.wait(10)); Assert.assertTrue(secondPlacementGroup.wait(10)); - PlacementGroup firstPlacementGroupRes = Ray.getPlacementGroup((firstPlacementGroup).getId()); - PlacementGroup secondPlacementGroupRes = Ray.getPlacementGroup((secondPlacementGroup).getId()); + PlacementGroupImpl firstPlacementGroupRes = + (PlacementGroupImpl) Ray.getPlacementGroup((firstPlacementGroup).getId()); + PlacementGroupImpl secondPlacementGroupRes = + (PlacementGroupImpl) Ray.getPlacementGroup((secondPlacementGroup).getId()); Assert.assertNotNull(firstPlacementGroupRes); Assert.assertNotNull(secondPlacementGroupRes); @@ -70,9 +76,9 @@ public void testGetPlacementGroup() { List allPlacementGroup = Ray.getAllPlacementGroups(); Assert.assertEquals(allPlacementGroup.size(), 2); - PlacementGroup placementGroupRes = allPlacementGroup.get(0); + PlacementGroupImpl placementGroupRes = (PlacementGroupImpl) allPlacementGroup.get(0); Assert.assertNotNull(placementGroupRes.getId()); - PlacementGroup expectPlacementGroup = + PlacementGroupImpl expectPlacementGroup = placementGroupRes.getId().equals(firstPlacementGroup.getId()) ? 
firstPlacementGroup : secondPlacementGroup; @@ -88,16 +94,18 @@ public void testRemovePlacementGroup() { PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); - PlacementGroup secondPlacementGroup = - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); + PlacementGroupImpl secondPlacementGroup = + (PlacementGroupImpl) + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); List allPlacementGroup = Ray.getAllPlacementGroups(); Assert.assertEquals(allPlacementGroup.size(), 2); Ray.removePlacementGroup(secondPlacementGroup.getId()); - PlacementGroup removedPlacementGroup = Ray.getPlacementGroup((secondPlacementGroup).getId()); + PlacementGroupImpl removedPlacementGroup = + (PlacementGroupImpl) Ray.getPlacementGroup((secondPlacementGroup).getId()); Assert.assertEquals(removedPlacementGroup.getState(), PlacementGroupState.REMOVED); // Wait for placement group after it is removed. From 28e92255620149991413593b61d95d8b4b0727ff Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 210/244] Revert "[RLlib] move evaluation to trainer.step() such that the result is properly logged (#12708)" This reverts commit 59010e252ffb81305ad3bb4a760fd097b3606d67. 
--- rllib/agents/trainer.py | 8 ++++++++ rllib/agents/trainer_template.py | 12 ------------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index 47e637f6dea7..9055fe378a36 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -535,6 +535,14 @@ def train(self) -> ResultDict: if hasattr(self, "workers") and isinstance(self.workers, WorkerSet): self._sync_filters_if_needed(self.workers) + if self.config["evaluation_interval"] == 1 or ( + self._iteration > 0 and self.config["evaluation_interval"] + and self._iteration % self.config["evaluation_interval"] == 0): + evaluation_metrics = self._evaluate() + assert isinstance(evaluation_metrics, dict), \ + "_evaluate() needs to return a dict." + result.update(evaluation_metrics) + return result def _sync_filters_if_needed(self, workers: WorkerSet): diff --git a/rllib/agents/trainer_template.py b/rllib/agents/trainer_template.py index 600cbef12bd9..b896958b6bf1 100644 --- a/rllib/agents/trainer_template.py +++ b/rllib/agents/trainer_template.py @@ -146,18 +146,6 @@ def _init(self, config: TrainerConfigDict, @override(Trainer) def step(self): res = next(self.train_exec_impl) - - # self._iteration gets incremented after this function returns, - # meaning that e. g. the first time this function is called, - # self._iteration will be 0. We check `self._iteration+1` in the - # if-statement below to reflect that the first training iteration - # is already over. - if (self.config["evaluation_interval"] and (self._iteration + 1) % - self.config["evaluation_interval"] == 0): - evaluation_metrics = self._evaluate() - assert isinstance(evaluation_metrics, dict), \ - "_evaluate() needs to return a dict." 
- res.update(evaluation_metrics) return res @override(Trainer) From 99d58ec792b68101cf53c3c774168d29079b0719 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 211/244] Revert "[RLlib] Fix bug in ModelCatalog when using custom action distribution (#12846)" This reverts commit 7889b84b55cdd3b8cfdfab7d083f111b3d992bf0. --- rllib/models/catalog.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 6d0bfd111296..8e3e43dd08b3 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -204,8 +204,8 @@ def get_action_dist( "Using custom action distribution {}".format(action_dist_name)) dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) - return ModelCatalog._get_multi_action_distribution( - dist_cls, action_space, config, framework) + dist_cls = ModelCatalog._get_multi_action_distribution( + dist_cls, action_space, {}, framework) # Dist_type is given directly as a class. elif type(dist_type) is type and \ @@ -740,8 +740,7 @@ def _get_multi_action_distribution(dist_class, action_space, config, action_space=action_space, child_distributions=child_dists, input_lens=input_lens), int(sum(input_lens)) - return dist_class, dist_class.required_model_output_shape( - action_space, config) + return dist_class @staticmethod def _validate_config(config: ModelConfigDict, framework: str) -> None: From 3e134689c06f412d14c64461d506e45a934baddb Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 212/244] Revert "[RLlib] MAML: Add cartpole mass test for PyTorch. (#13679)" This reverts commit c1cbe2884e5ccb87edb61bde43847bef26dbfca3. 
--- python/requirements_rllib.txt | 3 --- rllib/agents/maml/tests/test_maml.py | 24 ++++++++------------- rllib/examples/env/cartpole_mass.py | 31 ---------------------------- rllib/examples/env/pendulum_mass.py | 9 +++----- 4 files changed, 12 insertions(+), 55 deletions(-) delete mode 100644 rllib/examples/env/cartpole_mass.py diff --git a/python/requirements_rllib.txt b/python/requirements_rllib.txt index 0cefb02969b3..94ae9cdbb338 100644 --- a/python/requirements_rllib.txt +++ b/python/requirements_rllib.txt @@ -13,6 +13,3 @@ pettingzoo>=1.4.0 # For tests on RecSim and Kaggle envs. recsim kaggle_environments - -# For MAML on PyTorch. -higher diff --git a/rllib/agents/maml/tests/test_maml.py b/rllib/agents/maml/tests/test_maml.py index b84e02857190..e5ef3cf694b0 100644 --- a/rllib/agents/maml/tests/test_maml.py +++ b/rllib/agents/maml/tests/test_maml.py @@ -23,21 +23,15 @@ def test_maml_compilation(self): num_iterations = 1 # Test for tf framework (torch not implemented yet). - for fw in framework_iterator(config, frameworks=("tf", "torch")): - for env in [ - "pendulum_mass.PendulumMassEnv", - "cartpole_mass.CartPoleMassEnv" - ]: - if fw == "tf" and env.startswith("cartpole"): - continue - print("env={}".format(env)) - env_ = "ray.rllib.examples.env.{}".format(env) - trainer = maml.MAMLTrainer(config=config, env=env_) - for i in range(num_iterations): - trainer.train() - check_compute_single_action( - trainer, include_prev_action_reward=True) - trainer.stop() + for _ in framework_iterator(config, frameworks=("tf")): + trainer = maml.MAMLTrainer( + config=config, + env="ray.rllib.examples.env.pendulum_mass.PendulumMassEnv") + for i in range(num_iterations): + trainer.train() + check_compute_single_action( + trainer, include_prev_action_reward=True) + trainer.stop() if __name__ == "__main__": diff --git a/rllib/examples/env/cartpole_mass.py b/rllib/examples/env/cartpole_mass.py deleted file mode 100644 index a0519cb17869..000000000000 --- 
a/rllib/examples/env/cartpole_mass.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -import gym -from gym.envs.classic_control.cartpole import CartPoleEnv -from ray.rllib.env.meta_env import MetaEnv - - -class CartPoleMassEnv(CartPoleEnv, gym.utils.EzPickle, MetaEnv): - """CartPoleMassEnv varies the weights of the cart and the pole. - """ - - def sample_tasks(self, n_tasks): - # Sample new cart- and pole masses (random floats between 0.5 and 2.0 - # (cart) and between 0.05 and 0.2 (pole)). - cart_masses = np.random.uniform(low=0.5, high=2.0, size=(n_tasks, 1)) - pole_masses = np.random.uniform(low=0.05, high=0.2, size=(n_tasks, 1)) - return np.concatenate([cart_masses, pole_masses], axis=-1) - - def set_task(self, task): - """ - Args: - task (Tuple[float]): Masses of the cart and the pole. - """ - self.masscart = task[0] - self.masspole = task[1] - - def get_task(self): - """ - Returns: - Tuple[float]: The current mass of the cart- and pole. - """ - return np.array([self.masscart, self.masspole]) diff --git a/rllib/examples/env/pendulum_mass.py b/rllib/examples/env/pendulum_mass.py index b68b283e7410..c4dc93ed7342 100644 --- a/rllib/examples/env/pendulum_mass.py +++ b/rllib/examples/env/pendulum_mass.py @@ -11,22 +11,19 @@ class PendulumMassEnv(PendulumEnv, gym.utils.EzPickle, MetaEnv): """ def sample_tasks(self, n_tasks): - # Sample new pendulum masses (random floats between 0.5 and 2). + # Mass is a random float between 0.5 and 2 return np.random.uniform(low=0.5, high=2.0, size=(n_tasks, )) def set_task(self, task): """ Args: - task (float): Task of the meta-learning environment (here: mass of - the pendulum). + task: task of the meta-learning environment """ - # self.m is the mass property of the pendulum. self.m = task def get_task(self): """ Returns: - float: The current mass of the pendulum (self.m in the PendulumEnv - object). 
+ task: task of the meta-learning environment """ return self.m From f79df667e3c20bbe98e3bd48d66ff61922fe9f89 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 213/244] Revert "[Java] [Test] Move multi-worker config to ray.conf file (#13583)" This reverts commit 6b031e788a050bedaae7af383bc9e885d0bb406c. --- java/test.sh | 7 +++++-- java/test/src/main/java/io/ray/test/FailureTest.java | 5 ++++- java/test/src/main/java/io/ray/test/JobConfigTest.java | 5 ++++- java/test/src/main/java/io/ray/test/KillActorTest.java | 5 ++++- java/test/src/main/resources/ray.conf | 6 ------ 5 files changed, 17 insertions(+), 11 deletions(-) delete mode 100644 java/test/src/main/resources/ray.conf diff --git a/java/test.sh b/java/test.sh index 49a0d68bbdc5..f946fd91ad6f 100755 --- a/java/test.sh +++ b/java/test.sh @@ -50,15 +50,18 @@ if ! git diff --exit-code -- java src/ray/core_worker/lib/java; then exit 1 fi +# Enable multi-worker feature in Java test +TEST_ARGS=(-Dray.job.num-java-workers-per-process=10) + echo "Running tests under cluster mode." # TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, # TestNG will exit with code 2. And bazel treats it as test failure. # bazel test //java:all_tests --config=ci || cluster_exit_code=$? -run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml +run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar "${TEST_ARGS[@]}" org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running tests under single-process mode." # bazel test //java:all_tests --jvmopt="-Dray.run-mode=SINGLE_PROCESS" --config=ci || single_exit_code=$? 
-run_testng java -Dray.run-mode="SINGLE_PROCESS" -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml +run_testng java -Dray.run-mode="SINGLE_PROCESS" -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar "${TEST_ARGS[@]}" org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running connecting existing cluster tests." case "${OSTYPE}" in diff --git a/java/test/src/main/java/io/ray/test/FailureTest.java b/java/test/src/main/java/io/ray/test/FailureTest.java index 5bfc40dd672e..218c78271023 100644 --- a/java/test/src/main/java/io/ray/test/FailureTest.java +++ b/java/test/src/main/java/io/ray/test/FailureTest.java @@ -23,17 +23,20 @@ public class FailureTest extends BaseTest { private static final String EXCEPTION_MESSAGE = "Oops"; + private String oldNumWorkersPerProcess; + @BeforeClass public void setUp() { // This is needed by `testGetThrowsQuicklyWhenFoundException`. // Set one worker per process. Otherwise, if `badFunc2` and `slowFunc` run in the same // process, `sleep` will delay `System.exit`. 
+ oldNumWorkersPerProcess = System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "1"); } @AfterClass public void tearDown() { - System.clearProperty("ray.job.num-java-workers-per-process"); + System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); } public static int badFunc() { diff --git a/java/test/src/main/java/io/ray/test/JobConfigTest.java b/java/test/src/main/java/io/ray/test/JobConfigTest.java index f5efc3377c3c..4ba9e484d5a1 100644 --- a/java/test/src/main/java/io/ray/test/JobConfigTest.java +++ b/java/test/src/main/java/io/ray/test/JobConfigTest.java @@ -10,8 +10,11 @@ @Test(groups = {"cluster"}) public class JobConfigTest extends BaseTest { + private String oldNumWorkersPerProcess; + @BeforeClass public void setupJobConfig() { + oldNumWorkersPerProcess = System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "3"); System.setProperty("ray.job.jvm-options.0", "-DX=999"); System.setProperty("ray.job.jvm-options.1", "-DY=998"); @@ -21,7 +24,7 @@ public void setupJobConfig() { @AfterClass public void tearDownJobConfig() { - System.clearProperty("ray.job.num-java-workers-per-process"); + System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); System.clearProperty("ray.job.jvm-options.0"); System.clearProperty("ray.job.jvm-options.1"); System.clearProperty("ray.job.worker-env.foo1"); diff --git a/java/test/src/main/java/io/ray/test/KillActorTest.java b/java/test/src/main/java/io/ray/test/KillActorTest.java index fd92b97118ef..d862d3e1232a 100644 --- a/java/test/src/main/java/io/ray/test/KillActorTest.java +++ b/java/test/src/main/java/io/ray/test/KillActorTest.java @@ -14,14 +14,17 @@ @Test(groups = {"cluster"}) public class KillActorTest extends BaseTest { + private String oldNumWorkersPerProcess; + @BeforeClass public void setUp() { + oldNumWorkersPerProcess = 
System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "1"); } @AfterClass public void tearDown() { - System.clearProperty("ray.job.num-java-workers-per-process"); + System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); } public static class HangActor { diff --git a/java/test/src/main/resources/ray.conf b/java/test/src/main/resources/ray.conf deleted file mode 100644 index b838c0075a3f..000000000000 --- a/java/test/src/main/resources/ray.conf +++ /dev/null @@ -1,6 +0,0 @@ -ray { - job { - # Enable multi-worker feature in Java test - num-java-workers-per-process: 10 - } -} From 06369052bae6bfe3ef5c24ed14b7961aafeec66c Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 214/244] Revert "Close #12031 (Autoscaler is overriding your resource for same quantity) (#13671)" This reverts commit 47d4a28c110dd74901d53a46f49a1dbf1a7c9aa3. 
--- python/ray/node.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/ray/node.py b/python/ray/node.py index 086865023e54..186ae3dfdbfd 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -292,10 +292,9 @@ def merge_resources(env_dict, params_dict): for key in set(env_dict.keys()).intersection( set(params_dict.keys())): - if params_dict[key] != env_dict[key]: - logger.warning("Autoscaler is overriding your resource:" - "{}: {} with {}.".format( - key, params_dict[key], env_dict[key])) + logger.warning("Autoscaler is overriding your resource:" + "{}: {} with {}.".format( + key, params_dict[key], env_dict[key])) return num_cpus, num_gpus, memory, object_store_memory, result if not self._resource_spec: From 69c00d4ce9235e568672f3d2c960c8305c781bee Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 215/244] Revert "[Object Spilling] Multi node file spilling V2. (#13542)" This reverts commit 7055a558a65d2b1e0158da8c19c392b36c59917c. 
--- python/ray/external_storage.py | 4 - python/ray/parameter.py | 3 - python/ray/tests/BUILD | 2 +- python/ray/tests/test_object_spilling.py | 183 ++++++++++-------- src/ray/common/ray_config_def.h | 4 - src/ray/gcs/accessor.h | 2 - .../gcs/gcs_client/service_based_accessor.cc | 4 +- .../gcs/gcs_client/service_based_accessor.h | 1 - src/ray/gcs/gcs_server/gcs_object_manager.cc | 10 +- src/ray/gcs/gcs_server/gcs_object_manager.h | 1 - .../gcs_server/gcs_placement_group_manager.h | 2 +- src/ray/object_manager/common.h | 5 +- src/ray/object_manager/object_buffer_pool.cc | 5 +- src/ray/object_manager/object_directory.cc | 43 ++-- src/ray/object_manager/object_directory.h | 9 +- src/ray/object_manager/object_manager.cc | 12 +- src/ray/object_manager/object_manager.h | 5 +- .../ownership_based_object_directory.cc | 6 +- src/ray/object_manager/pull_manager.cc | 60 ++---- src/ray/object_manager/pull_manager.h | 6 +- .../object_manager/test/pull_manager_test.cc | 130 +++++-------- src/ray/protobuf/gcs.proto | 10 +- src/ray/protobuf/gcs_service.proto | 5 +- src/ray/protobuf/node_manager.proto | 15 -- src/ray/raylet/local_object_manager.cc | 33 +--- src/ray/raylet/local_object_manager.h | 41 +--- src/ray/raylet/node_manager.cc | 78 ++------ src/ray/raylet/node_manager.h | 11 -- src/ray/raylet/raylet.cc | 5 +- src/ray/raylet/reconstruction_policy.cc | 3 +- src/ray/raylet/reconstruction_policy_test.cc | 5 +- .../raylet/test/local_object_manager_test.cc | 86 +------- src/ray/raylet_client/raylet_client.cc | 12 -- src/ray/raylet_client/raylet_client.h | 9 - .../rpc/node_manager/node_manager_client.h | 3 - .../rpc/node_manager/node_manager_server.h | 5 - 36 files changed, 247 insertions(+), 571 deletions(-) diff --git a/python/ray/external_storage.py b/python/ray/external_storage.py index 6e16351482cd..1b4f6fec81f1 100644 --- a/python/ray/external_storage.py +++ b/python/ray/external_storage.py @@ -345,10 +345,6 @@ def setup_external_storage(config): elif storage_type == 
"smart_open": _external_storage = ExternalStorageSmartOpenImpl( **config["params"]) - elif storage_type == "mock_distributed_fs": - # This storage is used to unit test distributed external storages. - # TODO(sang): Delete it after introducing the mock S3 test. - _external_storage = FileSystemStorage(**config["params"]) else: raise ValueError(f"Unknown external storage type: {storage_type}") else: diff --git a/python/ray/parameter.py b/python/ray/parameter.py index 666b82905b1e..a9b20769d1e2 100644 --- a/python/ray/parameter.py +++ b/python/ray/parameter.py @@ -330,6 +330,3 @@ def _check_usage(self): # Validate external storage usage. external_storage.setup_external_storage(object_spilling_config) external_storage.reset_external_storage() - # Configure the proper system config. - self._system_config["is_external_storage_type_fs"] = ( - object_spilling_config["type"] == "filesystem") diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 2ccdb4be2644..8fe8b21c3369 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -53,6 +53,7 @@ py_test_module_list( "test_multinode_failures_2.py", "test_multiprocessing.py", "test_object_manager.py", + "test_object_spilling.py", "test_output.py", "test_reconstruction.py", "test_reference_counting.py", @@ -133,7 +134,6 @@ py_test_module_list( py_test_module_list( files = [ "test_placement_group.py", - "test_object_spilling.py", ], size = "large", extra_srcs = SRCS, diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 68824b7bb09a..8319dbfcac54 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -21,15 +21,6 @@ "directory_path": spill_local_path } } -# Since we have differet protocol for a local external storage (e.g., fs) -# and distributed external storage (e.g., S3), we need to test both cases. -# This mocks the distributed fs with cluster utils. 
-mock_distributed_fs_object_spilling_config = { - "type": "mock_distributed_fs", - "params": { - "directory_path": spill_local_path - } -} smart_open_object_spilling_config = { "type": "smart_open", "params": { @@ -38,15 +29,6 @@ } -def create_object_spilling_config(request, tmp_path): - if (request.param["type"] == "filesystem" - or request.param["type"] == "mock_distributed_fs"): - temp_folder = tmp_path / "spill" - temp_folder.mkdir() - request.param["params"]["directory_path"] = str(temp_folder) - return json.dumps(request.param), temp_folder - - @pytest.fixture( scope="function", params=[ @@ -54,18 +36,10 @@ def create_object_spilling_config(request, tmp_path): # TODO(sang): Add a mock dependency to test S3. # smart_open_object_spilling_config, ]) -def object_spilling_config(request, tmp_path): - yield create_object_spilling_config(request, tmp_path) - - -@pytest.fixture( - scope="function", - params=[ - file_system_object_spilling_config, - mock_distributed_fs_object_spilling_config - ]) -def multi_node_object_spilling_config(request, tmp_path): - yield create_object_spilling_config(request, tmp_path) +def object_spilling_config(request, tmpdir): + if request.param["type"] == "filesystem": + request.param["params"]["directory_path"] = str(tmpdir) + yield json.dumps(request.param) def test_invalid_config_raises_exception(shutdown_only): @@ -101,17 +75,22 @@ def test_url_generation_and_parse(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_spilling_not_done_for_pinned_object(object_spilling_config, - shutdown_only): +def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), "min_spilling_size": 0, }) arr = np.random.rand(5 * 1024 * 1024) # 40 MB @@ -131,23 +110,27 @@ def is_dir_empty(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_spill_remote_object(ray_start_cluster, - multi_node_object_spilling_config): - cluster = ray_start_cluster - object_spilling_config, _ = multi_node_object_spilling_config - cluster.add_node( - num_cpus=0, - object_store_memory=75 * 1024 * 1024, - _system_config={ +@pytest.mark.parametrize( + "ray_start_cluster_head", [{ + "num_cpus": 0, + "object_store_memory": 75 * 1024 * 1024, + "_system_config": { "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, "max_io_workers": 4, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": "/tmp" + } + }), "min_spilling_size": 0, - }) - ray.init(address=cluster.address) + }, + }], + indirect=True) +def test_spill_remote_object(ray_start_cluster_head): + cluster = ray_start_cluster_head cluster.add_node(object_store_memory=75 * 1024 * 1024) - cluster.wait_for_nodes() @ray.remote def put(): @@ -179,7 +162,6 @@ def depends(arg): platform.system() == "Windows", reason="Failing on Windows.") def test_spill_objects_automatically(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, _ = object_spilling_config ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, @@ -215,9 +197,10 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_spill_stats(object_spilling_config, shutdown_only): +def test_spill_stats(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. - object_spilling_config, _ = object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() ray.init( num_cpus=1, object_store_memory=100 * 1024 * 1024, @@ -225,7 +208,14 @@ def test_spill_stats(object_spilling_config, shutdown_only): "automatic_object_spilling_enabled": True, "max_io_workers": 100, "min_spilling_size": 1, - "object_spilling_config": object_spilling_config + "object_spilling_config": json.dumps( + { + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }, + separators=(",", ":")) }, ) @@ -252,7 +242,6 @@ def f(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_during_get(object_spilling_config, shutdown_only): - object_spilling_config, _ = object_spilling_config ray.init( num_cpus=4, object_store_memory=100 * 1024 * 1024, @@ -284,7 +273,6 @@ def f(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_deadlock(object_spilling_config, shutdown_only): - object_spilling_config, _ = object_spilling_config # Limit our object store to 75 MiB of memory. ray.init( object_store_memory=75 * 1024 * 1024, @@ -314,9 +302,10 @@ def test_spill_deadlock(object_spilling_config, shutdown_only): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_delete_objects(object_spilling_config, shutdown_only): +def test_delete_objects(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -324,7 +313,12 @@ def test_delete_objects(object_spilling_config, shutdown_only): "min_spilling_size": 0, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) arr = np.random.rand(1024 * 1024) # 8 MB data replay_buffer = [] @@ -349,11 +343,13 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") -def test_delete_objects_delete_while_creating(object_spilling_config, - shutdown_only): + platform.system() in ["Windows", "Darwin"], + reason="Failing on " + "Windows and Mac.") +def test_delete_objects_delete_while_creating(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -361,7 +357,12 @@ def test_delete_objects_delete_while_creating(object_spilling_config, "min_spilling_size": 0, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) arr = np.random.rand(1024 * 1024) # 8 MB data replay_buffer = [] @@ -394,18 +395,25 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") -def test_delete_objects_on_worker_failure(object_spilling_config, - shutdown_only): + platform.system() in ["Windows", "Darwin"], + reason="Failing on Windows " + "and Mac.") +def test_delete_objects_on_worker_failure(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. - object_spilling_config, temp_folder = object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), "min_spilling_size": 0, }) @@ -461,10 +469,10 @@ def is_dir_empty(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_delete_objects_multi_node(multi_node_object_spilling_config, - ray_start_cluster): +def test_delete_objects_multi_node(tmp_path, ray_start_cluster): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = multi_node_object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() cluster = ray_start_cluster # Head node. cluster.add_node( @@ -475,7 +483,12 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config, "min_spilling_size": 20 * 1024 * 1024, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) # Add 2 worker nodes. for _ in range(2): @@ -533,9 +546,10 @@ def is_dir_empty(): @pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") -def test_fusion_objects(object_spilling_config, shutdown_only): +def test_fusion_objects(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. - object_spilling_config, temp_folder = object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() min_spilling_size = 10 * 1024 * 1024 ray.init( object_store_memory=75 * 1024 * 1024, @@ -543,7 +557,12 @@ def test_fusion_objects(object_spilling_config, shutdown_only): "max_io_workers": 3, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), "min_spilling_size": min_spilling_size, }) replay_buffer = [] @@ -581,8 +600,8 @@ def test_fusion_objects(object_spilling_config, shutdown_only): # https://github.com/ray-project/ray/issues/12912 -def do_test_release_resource(object_spilling_config, expect_released): - object_spilling_config, temp_folder = object_spilling_config +def do_test_release_resource(tmp_path, expect_released): + temp_folder = tmp_path / "spill" ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, @@ -590,7 +609,12 @@ def 
do_test_release_resource(object_spilling_config, expect_released): "max_io_workers": 1, "release_resources_during_plasma_fetch": expect_released, "automatic_object_spilling_enabled": True, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) plasma_obj = ray.put(np.ones(50 * 1024 * 1024, dtype=np.uint8)) for _ in range(5): @@ -619,14 +643,14 @@ def f(dep): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_no_release_during_plasma_fetch(object_spilling_config, shutdown_only): - do_test_release_resource(object_spilling_config, expect_released=False) +def test_no_release_during_plasma_fetch(tmp_path, shutdown_only): + do_test_release_resource(tmp_path, expect_released=False) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_release_during_plasma_fetch(object_spilling_config, shutdown_only): - do_test_release_resource(object_spilling_config, expect_released=True) +def test_release_during_plasma_fetch(tmp_path, shutdown_only): + do_test_release_resource(tmp_path, expect_released=True) @pytest.mark.skip( @@ -637,7 +661,6 @@ def test_release_during_plasma_fetch(object_spilling_config, shutdown_only): @pytest.mark.timeout(30) def test_spill_objects_on_object_transfer(object_spilling_config, ray_start_cluster): - object_spilling_config, _ = object_spilling_config # This test checks that objects get spilled to make room for transferred # objects. cluster = ray_start_cluster diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index d06a1c358196..cfbc62517d5e 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -361,10 +361,6 @@ RAY_CONFIG(bool, automatic_object_deletion_enabled, true) /// Grace period until we throw the OOM error to the application in seconds. 
RAY_CONFIG(int64_t, oom_grace_period_s, 10) -/// Whether or not the external storage is file system. -/// This is configured based on object_spilling_config. -RAY_CONFIG(bool, is_external_storage_type_fs, true) - /* Configuration parameters for locality-aware scheduling. */ /// Whether to enable locality-aware leasing. If enabled, then Ray will consider task /// dependency locality when choosing a worker for leasing. diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index 3bc7002021b3..ab0704bcadd7 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -303,12 +303,10 @@ class ObjectInfoAccessor { /// /// \param object_id The ID of object which location will be added to GCS. /// \param spilled_url The URL where the object has been spilled. - /// \param spilled_node_id The NodeID where the object has been spilled. /// \param callback Callback that will be called after object has been added to GCS. /// \return Status virtual Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, const StatusCallback &callback) = 0; /// Remove location of object from GCS asynchronously. 
diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index 821e0f7d930a..dfa192320976 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -1102,14 +1102,13 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_i Status ServiceBasedObjectInfoAccessor::AsyncAddSpilledUrl( const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, const StatusCallback &callback) { + const StatusCallback &callback) { RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id << ", spilled_url = " << spilled_url << ", job id = " << object_id.TaskId().JobId(); rpc::AddObjectLocationRequest request; request.set_object_id(object_id.Binary()); request.set_spilled_url(spilled_url); - request.set_spilled_node_id(spilled_node_id.Binary()); auto operation = [this, request, callback](const SequencerDoneCallback &done_callback) { client_impl_->GetGcsRpcClient().AddObjectLocation( @@ -1180,7 +1179,6 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations( if (!result->spilled_url().empty()) { rpc::ObjectLocationChange update; update.set_spilled_url(result->spilled_url()); - update.set_spilled_node_id(result->spilled_node_id()); update.set_size(result->size()); notification.push_back(update); } diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index 149fa6d2e8d4..2d362976dd22 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -326,7 +326,6 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor { size_t object_size, const StatusCallback &callback) override; Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &node_id, const StatusCallback &callback) override; Status AsyncRemoveLocation(const 
ObjectID &object_id, const NodeID &node_id, diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.cc b/src/ray/gcs/gcs_server/gcs_object_manager.cc index 818904d65b61..73971ed7f18f 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_object_manager.cc @@ -66,7 +66,6 @@ void GcsObjectManager::HandleAddObjectLocation( NodeID node_id; std::string spilled_url; - NodeID spilled_node_id; if (!request.node_id().empty()) { node_id = NodeID::FromBinary(request.node_id()); RAY_LOG(DEBUG) << "Adding object location, job id = " << object_id.TaskId().JobId() @@ -76,14 +75,12 @@ void GcsObjectManager::HandleAddObjectLocation( absl::MutexLock lock(&mutex_); RAY_CHECK(!request.spilled_url().empty()); spilled_url = request.spilled_url(); - spilled_node_id = NodeID::FromBinary(request.spilled_node_id()); object_to_locations_[object_id].spilled_url = spilled_url; - object_to_locations_[object_id].spilled_node_id = spilled_node_id; RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id; } size_t size = request.size(); - auto on_done = [this, object_id, node_id, spilled_url, size, spilled_node_id, reply, + auto on_done = [this, object_id, node_id, spilled_url, size, reply, send_reply_callback](const Status &status) { if (status.ok()) { rpc::ObjectLocationChange notification; @@ -93,7 +90,6 @@ void GcsObjectManager::HandleAddObjectLocation( } if (!spilled_url.empty()) { notification.set_spilled_url(spilled_url); - notification.set_spilled_node_id(spilled_node_id.Binary()); } notification.set_size(size); RAY_CHECK_OK(gcs_pub_sub_->Publish(OBJECT_CHANNEL, object_id.Hex(), @@ -101,8 +97,7 @@ void GcsObjectManager::HandleAddObjectLocation( RAY_LOG(DEBUG) << "Finished adding object location, job id = " << object_id.TaskId().JobId() << ", object id = " << object_id << ", node id = " << node_id << ", task id = " << object_id.TaskId() - << ", spilled_url = " << spilled_url - << ", spilled_node_id = " << spilled_node_id; + << ", 
spilled_url = " << spilled_url; } else { RAY_LOG(ERROR) << "Failed to add object location: " << status.ToString() << ", job id = " << object_id.TaskId().JobId() @@ -296,7 +291,6 @@ const ObjectLocationInfo GcsObjectManager::GenObjectLocationInfo( object_data.add_locations()->set_manager(node_id.Binary()); } object_data.set_spilled_url(it->second.spilled_url); - object_data.set_spilled_node_id(it->second.spilled_node_id.Binary()); object_data.set_size(it->second.object_size); } return object_data; diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.h b/src/ray/gcs/gcs_server/gcs_object_manager.h index 6d4d39598cb6..2afff0816850 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.h +++ b/src/ray/gcs/gcs_server/gcs_object_manager.h @@ -65,7 +65,6 @@ class GcsObjectManager : public rpc::ObjectInfoHandler { struct LocationSet { absl::flat_hash_set locations; std::string spilled_url = ""; - NodeID spilled_node_id = NodeID::Nil(); size_t object_size = 0; }; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index c76849108990..8bd36941745f 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -193,7 +193,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { void OnPlacementGroupCreationSuccess( const std::shared_ptr &placement_group); - /// Remove the placement group of a given id. + /// TODO-SANG Fill it up. 
void RemovePlacementGroup(const PlacementGroupID &placement_group_id, StatusCallback on_placement_group_removed); diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 3cda75266ad0..9c71e2c2b5e8 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -17,8 +17,7 @@ using SpillObjectsCallback = std::function; using SpaceReleasedCallback = std::function; /// A callback to call when a spilled object needs to be returned to the object store. -using RestoreSpilledObjectCallback = - std::function)>; +using RestoreSpilledObjectCallback = std::function)>; } // namespace ray diff --git a/src/ray/object_manager/object_buffer_pool.cc b/src/ray/object_manager/object_buffer_pool.cc index 726a6fefca35..4b6a44e6b5fd 100644 --- a/src/ray/object_manager/object_buffer_pool.cc +++ b/src/ray/object_manager/object_buffer_pool.cc @@ -59,10 +59,7 @@ std::pair ObjectBufferPool::Ge plasma::ObjectBuffer object_buffer; RAY_CHECK_OK(store_client_.Get(&object_id, 1, 0, &object_buffer)); if (object_buffer.data == nullptr) { - RAY_LOG(INFO) - << "Failed to get a chunk of the object: " << object_id - << ". It is mostly because the object is already evicted or spilled when the " - "pull request is received. 
The caller will retry the pull request again."; + RAY_LOG(ERROR) << "Failed to get object"; return std::pair( errored_chunk_, ray::Status::IOError("Unable to obtain object chunk, object not local.")); diff --git a/src/ray/object_manager/object_directory.cc b/src/ray/object_manager/object_directory.cc index 27e6f42b0bd6..ccfda7f5a37c 100644 --- a/src/ray/object_manager/object_directory.cc +++ b/src/ray/object_manager/object_directory.cc @@ -32,7 +32,7 @@ using ray::rpc::ObjectTableData; bool UpdateObjectLocations(const std::vector &location_updates, std::shared_ptr gcs_client, std::unordered_set *node_ids, std::string *spilled_url, - NodeID *spilled_node_id, size_t *object_size) { + size_t *object_size) { // location_updates contains the updates of locations of the object. // with GcsChangeMode, we can determine whether the update mode is // addition or deletion. @@ -57,12 +57,9 @@ bool UpdateObjectLocations(const std::vector &locatio } } else { RAY_CHECK(!update.spilled_url().empty()); - const auto received_spilled_node_id = NodeID::FromBinary(update.spilled_node_id()); - RAY_LOG(DEBUG) << "Received object spilled at " << update.spilled_url() - << " spilled at " << NodeID::FromBinary(update.spilled_node_id()); + RAY_LOG(DEBUG) << "Received object spilled at " << update.spilled_url(); if (update.spilled_url() != *spilled_url) { *spilled_url = update.spilled_url(); - *spilled_node_id = received_spilled_node_id; isUpdated = true; } } @@ -131,17 +128,14 @@ void ObjectDirectory::HandleNodeRemoved(const NodeID &node_id) { // If the subscribed object has the removed node as a location, update // its locations with an empty update so that the location will be removed. 
UpdateObjectLocations({}, gcs_client_, &listener.second.current_object_locations, - &listener.second.spilled_url, - &listener.second.spilled_node_id, - &listener.second.object_size); + &listener.second.spilled_url, &listener.second.object_size); // Re-call all the subscribed callbacks for the object, since its // locations have changed. for (const auto &callback_pair : listener.second.callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, listener.second.current_object_locations, - listener.second.spilled_url, listener.second.spilled_node_id, - listener.second.object_size); + listener.second.spilled_url, listener.second.object_size); } } } @@ -168,11 +162,11 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // Once this flag is set to true, it should never go back to false. it->second.subscribed = true; + // Update entries for this object. if (!UpdateObjectLocations(object_notifications, gcs_client_, &it->second.current_object_locations, - &it->second.spilled_url, &it->second.spilled_node_id, - &it->second.object_size)) { + &it->second.spilled_url, &it->second.object_size)) { return; } // Copy the callbacks so that the callbacks can unsubscribe without interrupting @@ -186,8 +180,7 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // It is safe to call the callback directly since this is already running // in the subscription callback stack. 
callback_pair.second(object_id, it->second.current_object_locations, - it->second.spilled_url, it->second.spilled_node_id, - it->second.object_size); + it->second.spilled_url, it->second.object_size); } }; status = gcs_client_->Objects().AsyncSubscribeToLocations( @@ -205,12 +198,10 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i if (listener_state.subscribed) { auto &locations = listener_state.current_object_locations; auto &spilled_url = listener_state.spilled_url; - auto &spilled_node_id = listener_state.spilled_node_id; auto object_size = it->second.object_size; - io_service_.post( - [callback, locations, spilled_url, object_size, object_id, spilled_node_id]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); + io_service_.post([callback, locations, spilled_url, object_size, object_id]() { + callback(object_id, locations, spilled_url, object_size); + }); } return status; } @@ -242,12 +233,10 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, // cached locations. 
auto &locations = it->second.current_object_locations; auto &spilled_url = it->second.spilled_url; - auto &spilled_node_id = it->second.spilled_node_id; auto object_size = it->second.object_size; - io_service_.post( - [callback, object_id, spilled_url, locations, object_size, spilled_node_id]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); + io_service_.post([callback, object_id, spilled_url, locations, object_size]() { + callback(object_id, locations, spilled_url, object_size); + }); } else { // We do not have any locations cached due to a concurrent // SubscribeObjectLocations call, so look up the object's locations @@ -269,19 +258,17 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, if (!update->spilled_url().empty()) { rpc::ObjectLocationChange change; change.set_spilled_url(update->spilled_url()); - change.set_spilled_node_id(update->spilled_node_id()); notification.push_back(change); } std::unordered_set node_ids; std::string spilled_url; - NodeID spilled_node_id; size_t object_size = 0; UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url, - &spilled_node_id, &object_size); + &object_size); // It is safe to call the callback directly since this is already running // in the GCS client's lookup callback stack. - callback(object_id, node_ids, spilled_url, spilled_node_id, object_size); + callback(object_id, node_ids, spilled_url, object_size); }); } return status; diff --git a/src/ray/object_manager/object_directory.h b/src/ray/object_manager/object_directory.h index 0a4c6300a81a..8f06888aee23 100644 --- a/src/ray/object_manager/object_directory.h +++ b/src/ray/object_manager/object_directory.h @@ -41,9 +41,9 @@ struct RemoteConnectionInfo { }; /// Callback for object location notifications. 
-using OnLocationsFound = std::function &, - const std::string &, const NodeID &, size_t object_size)>; +using OnLocationsFound = std::function &, + const std::string &, size_t object_size)>; class ObjectDirectoryInterface { public: @@ -185,9 +185,6 @@ class ObjectDirectory : public ObjectDirectoryInterface { std::unordered_set current_object_locations; /// The location where this object has been spilled, if any. std::string spilled_url = ""; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. - NodeID spilled_node_id = NodeID::Nil(); /// The size of the object. size_t object_size = 0; /// This flag will get set to true if received any notification of the object. diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index ddd71c7665ab..467ea25675e9 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -220,10 +220,8 @@ uint64_t ObjectManager::Pull(const std::vector &object_ref const auto &callback = [this](const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size) { - pull_manager_->OnLocationChange(object_id, client_ids, spilled_url, spilled_node_id, - object_size); + const std::string &spilled_url, size_t object_size) { + pull_manager_->OnLocationChange(object_id, client_ids, spilled_url, object_size); }; for (const auto &ref : objects_to_locate) { @@ -515,8 +513,7 @@ ray::Status ObjectManager::LookupRemainingWaitObjects(const UniqueID &wait_id) { object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &lookup_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, const NodeID &spilled_node_id, - size_t object_size) { + const std::string &spilled_url, size_t object_size) { auto &wait_state = active_wait_requests_.find(wait_id)->second; // Note that the object is 
guaranteed to be added to local_objects_ before // the notification is triggered. @@ -557,8 +554,7 @@ void ObjectManager::SubscribeRemainingWaitObjects(const UniqueID &wait_id) { wait_id, object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &subscribe_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, const NodeID &spilled_node_id, - size_t object_size) { + const std::string &spilled_url, size_t object_size) { auto object_id_wait_state = active_wait_requests_.find(wait_id); if (object_id_wait_state == active_wait_requests_.end()) { // Depending on the timing of calls to the object directory, we diff --git a/src/ray/object_manager/object_manager.h b/src/ray/object_manager/object_manager.h index 00073012213a..a114f16bc446 100644 --- a/src/ray/object_manager/object_manager.h +++ b/src/ray/object_manager/object_manager.h @@ -106,9 +106,8 @@ class ObjectManagerInterface { class ObjectManager : public ObjectManagerInterface, public rpc::ObjectManagerServiceHandler { public: - using RestoreSpilledObjectCallback = - std::function)>; + using RestoreSpilledObjectCallback = std::function)>; /// Implementation of object manager service diff --git a/src/ray/object_manager/ownership_based_object_directory.cc b/src/ray/object_manager/ownership_based_object_directory.cc index a17d3dfc66c0..efc37b3e8d8c 100644 --- a/src/ray/object_manager/ownership_based_object_directory.cc +++ b/src/ray/object_manager/ownership_based_object_directory.cc @@ -146,7 +146,7 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, it->second.current_object_locations, "", - NodeID::Nil(), it->second.object_size); + it->second.object_size); } } @@ -213,7 +213,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. 
" << "LookupLocations returns an empty list of locations."; io_service_.post([callback, object_id]() { - callback(object_id, std::unordered_set(), "", NodeID::Nil(), 0); + callback(object_id, std::unordered_set(), "", 0); }); return Status::OK(); } @@ -234,7 +234,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( node_ids.emplace(NodeID::FromBinary(node_id)); } FilterRemovedNodes(gcs_client_, &node_ids); - callback(object_id, node_ids, "", NodeID::Nil(), reply.object_size()); + callback(object_id, node_ids, "", reply.object_size()); }); return Status::OK(); } diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index 302f2f4354ef..1ebf9214a707 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -259,8 +259,7 @@ std::vector PullManager::CancelPull(uint64_t request_id) { void PullManager::OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size) { + const std::string &spilled_url, size_t object_size) { // Exit if the Pull request has already been fulfilled or canceled. auto it = object_pull_requests_.find(object_id); if (it == object_pull_requests_.end()) { @@ -272,7 +271,7 @@ void PullManager::OnLocationChange(const ObjectID &object_id, // before. it->second.client_locations = std::vector(client_ids.begin(), client_ids.end()); it->second.spilled_url = spilled_url; - it->second.spilled_node_id = spilled_node_id; + if (!it->second.object_size_set) { RAY_LOG(DEBUG) << "Updated size of object " << object_id << " to " << object_size << ", num bytes being pulled is now " << num_bytes_being_pulled_; @@ -300,47 +299,30 @@ void PullManager::TryToMakeObjectLocal(const ObjectID &object_id) { return; } - // We always pull objects from a remote node before - // restoring it because of two reasons. - // 1. 
This will help reducing the load of external storages - // or remote node that spilled the object. - // 2. Also, if we use multi-node file spilling, the restoration will be - // confirmed by a object location subscription, so we should pull first - // before requesting for object restoration. - bool did_pull = PullFromRandomLocation(object_id); - if (did_pull) { - // New object locations were found, so begin trying to pull from a - // client. - UpdateRetryTimer(request); - return; - } - - // If we cannot pull, it means all objects have been evicted, so try restoring objects - // from the external storage. If the object was spilled on the current node, the - // callback will restore the object from the local the disk. - // Otherwise, it will send a request to a remote node that spilled the object. - // If external storage is a distributed storage, we always try restoring from it without - // sending RPCs. if (!request.spilled_url.empty()) { - const auto spilled_node_id = request.spilled_node_id; + // Try to restore the spilled object. restore_spilled_object_( - object_id, request.spilled_url, spilled_node_id, - [this, object_id, spilled_node_id](const ray::Status &status) { + object_id, request.spilled_url, [this, object_id](const ray::Status &status) { + bool did_pull = true; + // Fall back to fetching from another object manager. if (!status.ok()) { - const auto node_id_with_issue = - spilled_node_id.IsNil() ? self_node_id_ : spilled_node_id; - RAY_LOG(WARNING) - << "Object restoration failed and the object could " - "not be " - "found on any other nodes. This can happen if the location where the " - "object was spilled is unreachable. This job may hang if the object " - "is permanently unreachable. 
" - "Please check the log of node of id: " - << node_id_with_issue << " Object id: " << object_id; + did_pull = PullFromRandomLocation(object_id); + } + if (!did_pull) { + RAY_LOG(WARNING) << "Object restoration failed and the object could not be " + "found on any other nodes. Object id: " + << object_id; } }); - // We shouldn't update the timer here because restoration takes some time, and since - // we retry pull requests with exponential backoff, the delay could be large. + UpdateRetryTimer(request); + } else { + // New object locations were found, so begin trying to pull from a + // client. This will be called every time a new client location + // appears. + bool did_pull = PullFromRandomLocation(object_id); + if (did_pull) { + UpdateRetryTimer(request); + } } } diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index 26eba1a35264..e4a662eb6306 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -72,12 +72,9 @@ class PullManager { /// necessarily a super or subset of the previously available nodes. /// \param spilled_url The location of the object if it was spilled. If /// non-empty, the object may no longer be on any node. - /// \param spilled_node_id The node id of the object if it was spilled. If Nil, the - /// object may no longer be on any node. void OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, const NodeID &spilled_node_id, - size_t object_size); + const std::string &spilled_url, size_t object_size); /// Cancel an existing pull request. 
/// @@ -111,7 +108,6 @@ class PullManager { bundle_request_ids() {} std::vector client_locations; std::string spilled_url; - NodeID spilled_node_id; double next_pull_time; uint8_t num_retries; bool object_size_set = false; diff --git a/src/ray/object_manager/test/pull_manager_test.cc b/src/ray/object_manager/test/pull_manager_test.cc index ecdaa06198fb..345cc6ceadfe 100644 --- a/src/ray/object_manager/test/pull_manager_test.cc +++ b/src/ray/object_manager/test/pull_manager_test.cc @@ -24,7 +24,7 @@ class PullManagerTestWithCapacity { [this](const ObjectID &object_id, const NodeID &node_id) { num_send_pull_request_calls_++; }, - [this](const ObjectID &, const std::string &, const NodeID &, + [this](const ObjectID &, const std::string &, std::function callback) { num_restore_spilled_object_calls_++; restore_object_callback_ = callback; @@ -94,7 +94,7 @@ TEST_F(PullManagerTest, TestStaleSubscription) { ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); std::unordered_set client_ids; - pull_manager_.OnLocationChange(oid, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oid, client_ids, "", 0); AssertNumActiveRequestsEquals(1); // There are no client ids to pull from. @@ -109,7 +109,7 @@ TEST_F(PullManagerTest, TestStaleSubscription) { AssertNumActiveRequestsEquals(0); client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(oid, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oid, client_ids, "", 0); // Now we're getting a notification about an object that was already cancelled. 
ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -128,38 +128,26 @@ TEST_F(PullManagerTest, TestRestoreSpilledObject) { ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + AssertNumActiveRequestsEquals(1); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); - NodeID node_that_object_spilled = NodeID::FromRandom(); + client_ids.insert(NodeID::FromRandom()); fake_time_ += 10.; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - node_that_object_spilled, 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); // The behavior is supposed to be to always restore the spilled object if possible (even // if it exists elsewhere in the cluster). ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 1); - - // The restore object call will ask the remote node to restore the object, and the - // client location is updated accordingly. - client_ids.insert(node_that_object_spilled); - fake_time_ += 10.; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - node_that_object_spilled, 0); - - // Now the pull requests are sent. - ASSERT_EQ(num_send_pull_request_calls_, 1); - ASSERT_EQ(num_restore_spilled_object_calls_, 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 2); // Don't restore an object if it's local. 
object_is_local_ = true; num_restore_spilled_object_calls_ = 0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - NodeID::FromRandom(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); @@ -176,78 +164,51 @@ TEST_F(PullManagerTest, TestRestoreObjectFailed) { std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); AssertNumActiveRequestsEquals(1); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); - // Object is now spilled to a remote node, but the client_ids are still empty. - const NodeID remote_node_object_spilled = NodeID::FromRandom(); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_object_spilled, 0); + restore_object_callback_(ray::Status::IOError(":(")); + // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 1); - restore_object_callback_(ray::Status::IOError(":(")); + client_ids.insert(NodeID::FromRandom()); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + + // We always assume the restore succeeded so there's only 1 restore call still. + ASSERT_EQ(num_send_pull_request_calls_, 0); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); - // Now the restore request has failed, the remote object shouldn't have been properly - // restored. 
fake_time_ += 10.0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_object_spilled, 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - restore_object_callback_(ray::Status::OK()); - // Now the remote restoration request succeeds, so we sholud be able to pull the object. - client_ids.insert(remote_node_object_spilled); - // Since it is the second retry, the interval gets doubled. - fake_time_ += 20.0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_object_spilled, 0); + restore_object_callback_(ray::Status::IOError(":(")); + + // Since restore failed, we can fallback to pulling from another node immediately. + ASSERT_EQ(num_send_pull_request_calls_, 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 2); + + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); // Now that we've successfully sent a pull request, we need to wait for the retry period // before sending another one. ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - auto objects_to_cancel = pull_manager_.CancelPull(req_id); + pull_manager_.CancelPull(req_id); AssertNoLeaks(); } -TEST_F(PullManagerTest, TestLoadBalancingRestorationRequest) { - /* Make sure when the object copy is in other raylet, we pull object from there instead - * of requesting the owner node to restore the object. 
*/ - - auto refs = CreateObjectRefs(1); - auto obj1 = ObjectRefsToIds(refs)[0]; - rpc::Address addr1; - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); - std::vector objects_to_locate; - pull_manager_.Pull(refs, &objects_to_locate); - ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); - - std::unordered_set client_ids; - const auto copy_node1 = NodeID::FromRandom(); - const auto copy_node2 = NodeID::FromRandom(); - const auto remote_node_that_spilled_object = NodeID::FromRandom(); - client_ids.insert(copy_node1); - client_ids.insert(copy_node2); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_that_spilled_object, 0); - - ASSERT_EQ(num_send_pull_request_calls_, 1); - // Make sure the restore request wasn't sent since there are nodes that have a copied - // object. - ASSERT_EQ(num_restore_spilled_object_calls_, 0); -} - TEST_F(PullManagerTest, TestManyUpdates) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; @@ -261,7 +222,7 @@ TEST_F(PullManagerTest, TestManyUpdates) { client_ids.insert(NodeID::FromRandom()); for (int i = 0; i < 100; i++) { - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "", 0); AssertNumActiveRequestsEquals(1); } @@ -289,7 +250,7 @@ TEST_F(PullManagerTest, TestRetryTimer) { // We need to call OnLocationChange at least once, to population the list of nodes with // the object. - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "", 0); AssertNumActiveRequestsEquals(1); ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -300,7 +261,7 @@ TEST_F(PullManagerTest, TestRetryTimer) { // Location changes can trigger reset timer. for (; fake_time_ <= 120 * 10; fake_time_ += 1.) 
{ - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "", 0); } // We should make a pull request every tick (even if it's a duplicate to a node we're @@ -333,7 +294,7 @@ TEST_F(PullManagerTest, TestBasic) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, oids.size()); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -344,7 +305,7 @@ TEST_F(PullManagerTest, TestBasic) { num_send_pull_request_calls_ = 0; fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -357,7 +318,7 @@ TEST_F(PullManagerTest, TestBasic) { num_send_pull_request_calls_ = 0; fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -379,7 +340,7 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, oids.size()); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -393,8 +354,7 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { fake_time_ += 10; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); - 
pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); ASSERT_EQ(num_send_pull_request_calls_, i + 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); } @@ -408,7 +368,7 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { object_is_local_ = false; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -430,7 +390,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestBasic) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); + pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); } ASSERT_EQ(num_send_pull_request_calls_, oids.size()); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -446,7 +406,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestBasic) { fake_time_ += 10; auto prev_pull_requests = num_send_pull_request_calls_; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); + pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests); ASSERT_EQ(num_restore_spilled_object_calls_, 0); } @@ -489,7 +449,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestQueue) { client_ids.insert(NodeID::FromRandom()); for (auto &oids : bundles) { for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); + pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); } } @@ -540,7 +500,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestCancel) { req_ids.push_back(req_id); } for (size_t i = 0; i < 
object_sizes.size(); i++) { - pull_manager_.OnLocationChange(oids[i], {}, "", NodeID::Nil(), object_sizes[i]); + pull_manager_.OnLocationChange(oids[i], {}, "", object_sizes[i]); } AssertNumActiveRequestsEquals(num_active_requests_expected_before); pull_manager_.CancelPull(req_ids[cancel_idx]); @@ -548,14 +508,14 @@ TEST_F(PullManagerWithAdmissionControlTest, TestCancel) { // Request is really canceled. pull_manager_.OnLocationChange(oids[cancel_idx], {NodeID::FromRandom()}, "", - NodeID::Nil(), object_sizes[cancel_idx]); + object_sizes[cancel_idx]); ASSERT_EQ(num_send_pull_request_calls_, 0); // The expected number of requests at the head of the queue are pulled. int num_active = 0; for (size_t i = 0; i < refs.size() && num_active < num_active_requests_expected_after; i++) { - pull_manager_.OnLocationChange(oids[i], {NodeID::FromRandom()}, "", NodeID::Nil(), + pull_manager_.OnLocationChange(oids[i], {NodeID::FromRandom()}, "", object_sizes[i]); if (i != cancel_idx) { num_active++; diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 1e59ae8123ca..a332a908159e 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -413,11 +413,8 @@ message ObjectLocationInfo { // For objects that have been spilled to external storage, the URL from which // they can be retrieved. string spilled_url = 3; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. - bytes spilled_node_id = 4; // The size of the object in bytes. - uint64 size = 5; + uint64 size = 4; } // A notification message about one object's locations being changed. @@ -428,11 +425,8 @@ message ObjectLocationChange { // The object has been spilled to this URL. This should be set xor the above // fields are set. string spilled_url = 3; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. - bytes spilled_node_id = 4; // The size of the object in bytes. 
- uint64 size = 5; + uint64 size = 4; } // A notification message about one node's resources being changed. diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 8922ce6f466b..eda00b806b26 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -272,11 +272,8 @@ message AddObjectLocationRequest { // The spilled URL that will be added to GCS Service. Either this or the node // ID should be set. string spilled_url = 3; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. - bytes spilled_node_id = 4; // The size of the object in bytes. - uint64 size = 5; + uint64 size = 4; } message AddObjectLocationReply { diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index 386ed988ade3..bae2a9715100 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -179,18 +179,6 @@ message RequestObjectSpillageReply { bool success = 1; } -message RestoreSpilledObjectRequest { - // ObjectID to restore. - bytes object_id = 1; - // Object URL where the object is spilled. - string object_url = 2; - // The node id of a node where the object is spilled. - bytes spilled_node_id = 3; -} - -message RestoreSpilledObjectReply { -} - message ReleaseUnusedBundlesRequest { repeated Bundle bundles_in_use = 1; } @@ -236,9 +224,6 @@ service NodeManagerService { // Ask the raylet to spill an object to external storage. rpc RequestObjectSpillage(RequestObjectSpillageRequest) returns (RequestObjectSpillageReply); - // Ask the raylet to restore the object from the external storage. - rpc RestoreSpilledObject(RestoreSpilledObjectRequest) - returns (RestoreSpilledObjectReply); // This method is only used by GCS, and the purpose is to release bundles // that may be leaked. When GCS restarts, it doesn't know which bundles it has leased // in the previous lifecycle. 
In this case, GCS will send a list of bundles that diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index 9909beb76e55..721adb6bd3eb 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -261,15 +261,11 @@ void LocalObjectManager::AddSpilledUrls( const ObjectID &object_id = object_ids[i]; const std::string &object_url = worker_reply.spilled_objects_url(i); RAY_LOG(DEBUG) << "Object " << object_id << " spilled at " << object_url; - // Choose a node id to report. If an external storage type is not a filesystem, we - // don't need to report where this object is spilled. - const auto node_id_object_spilled = - is_external_storage_type_fs_ ? self_node_id_ : NodeID::Nil(); // Write to object directory. Wait for the write to finish before // releasing the object to make sure that the spilled object can // be retrieved by other raylets. RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl( - object_id, object_url, node_id_object_spilled, + object_id, object_url, [this, object_id, object_url, callback, num_remaining](Status status) { RAY_CHECK_OK(status); // Unpin the object. @@ -302,35 +298,14 @@ void LocalObjectManager::AddSpilledUrls( } void LocalObjectManager::AsyncRestoreSpilledObject( - const ObjectID &object_id, const std::string &object_url, const NodeID &node_id, + const ObjectID &object_id, const std::string &object_url, std::function callback) { + RAY_LOG(DEBUG) << "Restoring spilled object " << object_id << " from URL " + << object_url; if (objects_pending_restore_.count(object_id) > 0) { // If the same object is restoring, we dedup here. return; } - - if (!node_id.IsNil() && node_id != self_node_id_) { - // If we know where this object was spilled, and the current node is not that one, - // send a RPC to a remote node that spilled the object to restore it. 
- RAY_LOG(DEBUG) << "Send a object restoration request of id: " << object_id - << " to a remote node: " << node_id; - // TODO(sang): We need to deduplicate this remote RPC. Since restore request - // is retried every 10ms without exponential backoff, this can add huge overhead to a - // remote node that spilled the object. - restore_object_from_remote_node_(object_id, object_url, node_id); - if (callback) { - callback(Status::OK()); - } - return; - } - - // Restore the object. - RAY_LOG(DEBUG) << "Restoring spilled object " << object_id << " from URL " - << object_url; - if (!node_id.IsNil()) { - RAY_CHECK(spilled_objects_url_.count(object_id) > 0); - } - RAY_CHECK(objects_pending_restore_.emplace(object_id).second) << "Object dedupe wasn't done properly. Please report if you see this issue."; io_worker_pool_.PopRestoreWorker([this, object_id, object_url, callback]( diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index c4f157d58019..14142f5f913d 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -16,8 +16,6 @@ #include -#include -#include #include #include "ray/common/id.h" @@ -26,7 +24,6 @@ #include "ray/object_manager/common.h" #include "ray/raylet/worker_pool.h" #include "ray/rpc/worker/core_worker_client_pool.h" -#include "ray/util/util.h" #include "src/ray/protobuf/node_manager.pb.h" namespace ray { @@ -38,18 +35,15 @@ namespace raylet { class LocalObjectManager { public: LocalObjectManager( - const NodeID &node_id, size_t free_objects_batch_size, + boost::asio::io_service &io_context, size_t free_objects_batch_size, int64_t free_objects_period_ms, IOWorkerPoolInterface &io_worker_pool, gcs::ObjectInfoAccessor &object_info_accessor, rpc::CoreWorkerClientPool &owner_client_pool, bool object_pinning_enabled, bool automatic_object_deletion_enabled, int max_io_workers, - int64_t min_spilling_size, bool is_external_storage_type_fs, + int64_t min_spilling_size, std::function 
&)> on_objects_freed, - std::function is_plasma_object_spillable, - std::function - restore_object_from_remote_node) - : self_node_id_(node_id), - free_objects_period_ms_(free_objects_period_ms), + std::function is_plasma_object_spillable) + : free_objects_period_ms_(free_objects_period_ms), free_objects_batch_size_(free_objects_batch_size), io_worker_pool_(io_worker_pool), object_info_accessor_(object_info_accessor), @@ -61,9 +55,7 @@ class LocalObjectManager { min_spilling_size_(min_spilling_size), num_active_workers_(0), max_active_workers_(max_io_workers), - is_plasma_object_spillable_(is_plasma_object_spillable), - restore_object_from_remote_node_(restore_object_from_remote_node), - is_external_storage_type_fs_(is_external_storage_type_fs) {} + is_plasma_object_spillable_(is_plasma_object_spillable) {} /// Pin objects. /// @@ -98,15 +90,10 @@ class LocalObjectManager { /// Restore a spilled object from external storage back into local memory. /// /// \param object_id The ID of the object to restore. - /// \param object_url The URL where the object is spilled. - /// \param node_id Node id that we try restoring the object. If Nil is provided, the - /// object is restored directly from the external storage. If a node id is provided, it - /// sends a RPC request to a corresponding node if the given node_id is not equivalent - /// to a self node id. - /// \param callback A callback to call when the restoration is done. - /// Status will contain the error during restoration, if any. + /// \param object_url The URL in external storage from which the object can be restored. + /// \param callback A callback to call when the restoration is done. Status + /// will contain the error during restoration, if any. void AsyncRestoreSpilledObject(const ObjectID &object_id, const std::string &object_url, - const NodeID &node_id, std::function callback); /// Try to clear any objects that have been freed. 
@@ -173,8 +160,6 @@ class LocalObjectManager { /// \param urls_to_delete List of urls to delete from external storages. void DeleteSpilledObjects(std::vector &urls_to_delete); - const NodeID self_node_id_; - /// The period between attempts to eagerly evict objects from plasma. const int64_t free_objects_period_ms_; @@ -262,16 +247,6 @@ class LocalObjectManager { /// Return true if unpinned, meaning we can safely spill the object. False otherwise. std::function is_plasma_object_spillable_; - /// Callback to restore object of object id from a remote node of node id. - std::function - restore_object_from_remote_node_; - - /// Used to decide spilling protocol. - /// If it is "filesystem", it restores spilled objects only from an owner node. - /// If it is not (meaning it is distributed backend), it always restores objects - /// directly from the external storage. - bool is_external_storage_type_fs_; - /// /// Stats /// diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 072064f4695a..1b8c50c5870e 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -158,29 +158,19 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self agent_manager_service_(io_service, *agent_manager_service_handler_), client_call_manager_(io_service), worker_rpc_pool_(client_call_manager_), - local_object_manager_( - self_node_id_, RayConfig::instance().free_objects_batch_size(), - RayConfig::instance().free_objects_period_milliseconds(), worker_pool_, - gcs_client_->Objects(), worker_rpc_pool_, - /* object_pinning_enabled */ config.object_pinning_enabled, - /* automatic_object_deletion_enabled */ - config.automatic_object_deletion_enabled, - /*max_io_workers*/ config.max_io_workers, - /*min_spilling_size*/ config.min_spilling_size, - /*is_external_storage_type_fs*/ - RayConfig::instance().is_external_storage_type_fs(), - /*on_objects_freed*/ - [this](const std::vector &object_ids) { - 
object_manager_.FreeObjects(object_ids, - /*local_only=*/false); - }, - is_plasma_object_spillable, - /*restore_object_from_remote_node*/ - [this](const ObjectID &object_id, const std::string &spilled_url, - const NodeID &node_id) { - SendSpilledObjectRestorationRequestToRemoteNode(object_id, spilled_url, - node_id); - }), + local_object_manager_(io_service_, RayConfig::instance().free_objects_batch_size(), + RayConfig::instance().free_objects_period_milliseconds(), + worker_pool_, gcs_client_->Objects(), worker_rpc_pool_, + /* object_pinning_enabled */ config.object_pinning_enabled, + /* automatic_object_deletion_enabled */ + config.automatic_object_deletion_enabled, + /*max_io_workers*/ config.max_io_workers, + /*min_spilling_size*/ config.min_spilling_size, + [this](const std::vector &object_ids) { + object_manager_.FreeObjects(object_ids, + /*local_only=*/false); + }, + is_plasma_object_spillable), report_worker_backlog_(RayConfig::instance().report_worker_backlog()), last_local_gc_ns_(absl::GetCurrentTimeNanos()), local_gc_interval_ns_(RayConfig::instance().local_gc_interval_s() * 1e9), @@ -521,24 +511,6 @@ void NodeManager::HandleRequestObjectSpillage( }); } -void NodeManager::HandleRestoreSpilledObject( - const rpc::RestoreSpilledObjectRequest &request, - rpc::RestoreSpilledObjectReply *reply, rpc::SendReplyCallback send_reply_callback) { - const auto object_id = ObjectID::FromBinary(request.object_id()); - const auto spilled_node_id = NodeID::FromBinary(request.spilled_node_id()); - const auto object_url = request.object_url(); - RAY_CHECK(spilled_node_id == self_node_id_); - RAY_LOG(DEBUG) << "Restore spilled object request received. Object id: " << object_id - << " spilled_node_id: " << self_node_id_ - << " object url: " << object_url; - local_object_manager_.AsyncRestoreSpilledObject(object_id, object_url, spilled_node_id, - nullptr); - // Just reply right away. 
The caller will keep hitting this RPC endpoint until - // restoration succeeds, so we can safely reply here without waiting for the - // restoreSpilledObject to be done. - send_reply_callback(Status::OK(), nullptr, nullptr); -} - void NodeManager::HandleReleaseUnusedBundles( const rpc::ReleaseUnusedBundlesRequest &request, rpc::ReleaseUnusedBundlesReply *reply, rpc::SendReplyCallback send_reply_callback) { @@ -2742,30 +2714,6 @@ void NodeManager::PublishInfeasibleTaskError(const Task &task) const { } } -void NodeManager::SendSpilledObjectRestorationRequestToRemoteNode( - const ObjectID &object_id, const std::string &spilled_url, const NodeID &node_id) { - // Fetch from a remote node. - if (!remote_node_manager_addresses_.contains(node_id)) { - // It is possible the new node information is not received at this point. - // In this case, the PullManager will handle retry, so we just return. - return; - } - const auto &entry = remote_node_manager_addresses_.find(node_id); - // TODO(sang): Use a node manager pool instead. - auto raylet_client = - std::make_shared(rpc::NodeManagerWorkerClient::make( - entry->second.first, entry->second.second, client_call_manager_)); - raylet_client->RestoreSpilledObject( - object_id, spilled_url, node_id, - [](const ray::Status &status, const rpc::RestoreSpilledObjectReply &r) { - if (!status.ok()) { - RAY_LOG(WARNING) << "Failed to send a spilled object restoration request to a " - "remote node. This request will be retried. 
Error message: " - << status.ToString(); - } - }); -} - } // namespace raylet } // namespace ray diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 3a68fcbae992..d626e5246297 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -28,7 +28,6 @@ #include "ray/common/task/scheduling_resources.h" #include "ray/object_manager/object_manager.h" #include "ray/raylet/agent_manager.h" -#include "ray/raylet_client/raylet_client.h" #include "ray/raylet/local_object_manager.h" #include "ray/raylet/scheduling/scheduling_ids.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" @@ -604,11 +603,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::RequestObjectSpillageReply *reply, rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `RestoreSpilledObject` request. - void HandleRestoreSpilledObject(const rpc::RestoreSpilledObjectRequest &request, - rpc::RestoreSpilledObjectReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `ReleaseUnusedBundles` request. void HandleReleaseUnusedBundles(const rpc::ReleaseUnusedBundlesRequest &request, rpc::ReleaseUnusedBundlesReply *reply, @@ -639,11 +633,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, /// \param task Task that is infeasible void PublishInfeasibleTaskError(const Task &task) const; - /// Send a object restoration request to a remote node of a given node id. 
- void SendSpilledObjectRestorationRequestToRemoteNode(const ObjectID &object_id, - const std::string &spilled_url, - const NodeID &node_id); - std::unordered_map> MakeTasksByClass( const std::vector &tasks) const; diff --git a/src/ray/raylet/raylet.cc b/src/ray/raylet/raylet.cc index 4d9514e626da..6aeec576e1e4 100644 --- a/src/ray/raylet/raylet.cc +++ b/src/ray/raylet/raylet.cc @@ -72,11 +72,10 @@ Raylet::Raylet(boost::asio::io_service &main_service, const std::string &socket_ std::make_shared(main_service, gcs_client_))), object_manager_( main_service, self_node_id_, object_manager_config, object_directory_, - [this](const ObjectID &object_id, const std::string &object_url, - const NodeID &node_id, + [this](const ObjectID &object_id, const std::string &spilled_url, std::function callback) { node_manager_.GetLocalObjectManager().AsyncRestoreSpilledObject( - object_id, object_url, node_id, callback); + object_id, spilled_url, callback); }, [this]() { // This callback is called from the plasma store thread. diff --git a/src/ray/raylet/reconstruction_policy.cc b/src/ray/raylet/reconstruction_policy.cc index 1da422529cda..f4fd3d025fda 100644 --- a/src/ray/raylet/reconstruction_policy.cc +++ b/src/ray/raylet/reconstruction_policy.cc @@ -179,8 +179,7 @@ void ReconstructionPolicy::HandleTaskLeaseExpired(const TaskID &task_id) { created_object_id, it->second.owner_addresses[created_object_id], [this, task_id, reconstruction_attempt]( const ray::ObjectID &object_id, const std::unordered_set &nodes, - const std::string &spilled_url, const ray::NodeID &spilled_node_id, - size_t object_size) { + const std::string &spilled_url, size_t object_size) { if (nodes.empty() && spilled_url.empty()) { // The required object no longer exists on any live nodes. Attempt // reconstruction. 
diff --git a/src/ray/raylet/reconstruction_policy_test.cc b/src/ray/raylet/reconstruction_policy_test.cc index d4eb387a3ac0..8b5fd9d0e75c 100644 --- a/src/ray/raylet/reconstruction_policy_test.cc +++ b/src/ray/raylet/reconstruction_policy_test.cc @@ -58,10 +58,9 @@ class MockObjectDirectory : public ObjectDirectoryInterface { const ObjectID object_id = callback.first; auto it = locations_.find(object_id); if (it == locations_.end()) { - callback.second(object_id, std::unordered_set(), "", NodeID::Nil(), - 0); + callback.second(object_id, std::unordered_set(), "", 0); } else { - callback.second(object_id, it->second, "", NodeID::Nil(), 0); + callback.second(object_id, it->second, "", 0); } } callbacks_.clear(); diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index 8ff77250f78f..bbae5bb144b0 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -84,16 +84,12 @@ class MockIOWorkerClient : public rpc::CoreWorkerClientInterface { restore_callbacks.push_back(callback); } - bool ReplyRestoreObjects(int64_t bytes_restored, Status status = Status::OK()) { + void ReplyRestoreObjects(int64_t bytes_restored, Status status = Status::OK()) { rpc::RestoreSpilledObjectsReply reply; reply.set_bytes_restored_total(bytes_restored); - if (restore_callbacks.size() == 0) { - return false; - }; auto callback = restore_callbacks.front(); callback(status, reply); restore_callbacks.pop_front(); - return true; } void DeleteSpilledObjects( @@ -194,7 +190,6 @@ class MockObjectInfoAccessor : public gcs::ObjectInfoAccessor { size_t object_size, const gcs::StatusCallback &callback)); Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, const gcs::StatusCallback &callback) { object_urls[object_id] = spilled_url; callbacks.push_back(callback); @@ -257,15 +252,12 @@ class LocalObjectManagerTest : public 
::testing::Test { LocalObjectManagerTest() : owner_client(std::make_shared()), client_pool([&](const rpc::Address &addr) { return owner_client; }), - manager_node_id_(NodeID::FromRandom()), - manager(manager_node_id_, free_objects_batch_size, + manager(io_service_, free_objects_batch_size, /*free_objects_period_ms=*/1000, worker_pool, object_table, client_pool, /*object_pinning_enabled=*/true, /*automatic_object_delete_enabled=*/true, /*max_io_workers=*/2, /*min_spilling_size=*/0, - /*is_external_storage_type_fs=*/true, - /*on_objects_freed=*/ [&](const std::vector &object_ids) { for (const auto &object_id : object_ids) { freed.insert(object_id); @@ -274,24 +266,12 @@ class LocalObjectManagerTest : public ::testing::Test { /*is_plasma_object_spillable=*/ [&](const ray::ObjectID &object_id) { return unevictable_objects_.count(object_id) == 0; - }, - /*restore_object_from_remote_node=*/ - [&](const ObjectID &object_id, const std::string spilled_url, - const NodeID &node_id) { - if (remote_node_set_restore_requested_.count(node_id) == 0) { - remote_node_set_restore_requested_.emplace( - node_id, std::unordered_set()); - } - remote_node_set_restore_requested_[node_id].emplace(object_id); }), unpins(std::make_shared>()) { RayConfig::instance().initialize({{"object_spilling_config", "mock_config"}}); } - void TearDown() { - unevictable_objects_.clear(); - remote_node_set_restore_requested_.clear(); - } + void TearDown() { unevictable_objects_.clear(); } std::string BuildURL(const std::string url, int offset = 0, int num_objects = 1) { return url + "?" 
+ "num_objects=" + std::to_string(num_objects) + @@ -304,10 +284,7 @@ class LocalObjectManagerTest : public ::testing::Test { rpc::CoreWorkerClientPool client_pool; MockIOWorkerPool worker_pool; MockObjectInfoAccessor object_table; - NodeID manager_node_id_; LocalObjectManager manager; - std::unordered_map> - remote_node_set_restore_requested_; std::unordered_set freed; // This hashmap is incremented when objects are unpinned by destroying their @@ -346,43 +323,16 @@ TEST_F(LocalObjectManagerTest, TestPin) { } TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { - // First, spill objects. - std::vector object_ids; - std::vector> objects; - - for (size_t i = 0; i < free_objects_batch_size; i++) { - ObjectID object_id = ObjectID::FromRandom(); - object_ids.push_back(object_id); - auto data_buffer = std::make_shared(0, object_id, unpins); - std::unique_ptr object( - new RayObject(data_buffer, nullptr, std::vector())); - objects.push_back(std::move(object)); - } - manager.PinObjects(object_ids, std::move(objects)); - - manager.SpillObjects(object_ids, - [&](const Status &status) mutable { ASSERT_TRUE(status.ok()); }); - std::vector urls; - for (size_t i = 0; i < object_ids.size(); i++) { - urls.push_back(BuildURL("url" + std::to_string(i))); - } - ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); - for (size_t i = 0; i < object_ids.size(); i++) { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } - - // Then try restoring objects from local. - ObjectID object_id = object_ids[0]; - const auto url = urls[0]; + ObjectID object_id = ObjectID::FromRandom(); + std::string object_url("url"); int num_times_fired = 0; EXPECT_CALL(worker_pool, PushRestoreWorker(_)); // Subsequent calls should be deduped, so that only one callback should be fired. 
for (int i = 0; i < 10; i++) { - manager.AsyncRestoreSpilledObject(object_id, url, manager_node_id_, - [&](const Status &status) { - ASSERT_TRUE(status.ok()); - num_times_fired++; - }); + manager.AsyncRestoreSpilledObject(object_id, object_url, [&](const Status &status) { + ASSERT_TRUE(status.ok()); + num_times_fired++; + }); } ASSERT_EQ(num_times_fired, 0); @@ -392,25 +342,7 @@ TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { ASSERT_EQ(num_times_fired, 0); } worker_pool.io_worker_client->ReplyRestoreObjects(10); - // The restore should've been invoked. ASSERT_EQ(num_times_fired, 1); - - // If the object wasn't spilled on the current node, it should request restoration to - // remote nodes. - ObjectID remote_object_id = ObjectID::FromRandom(); - const auto remote_object_url = BuildURL("remote_url"); - NodeID remote_node_id = NodeID::FromRandom(); - manager.AsyncRestoreSpilledObject(remote_object_id, remote_object_url, remote_node_id, - [&](const Status &status) { - ASSERT_TRUE(status.ok()); - num_times_fired++; - }); - // Make sure the remote call was invoked. 
- ASSERT_FALSE(worker_pool.io_worker_client->ReplyRestoreObjects(10)); - ASSERT_TRUE(remote_node_set_restore_requested_.count(remote_node_id) > 0); - ASSERT_TRUE(remote_node_set_restore_requested_[remote_node_id].count(remote_object_id) > - 0); - ASSERT_EQ(num_times_fired, 2); } TEST_F(LocalObjectManagerTest, TestExplicitSpill) { diff --git a/src/ray/raylet_client/raylet_client.cc b/src/ray/raylet_client/raylet_client.cc index b3177071a144..739832b2bb40 100644 --- a/src/ray/raylet_client/raylet_client.cc +++ b/src/ray/raylet_client/raylet_client.cc @@ -311,18 +311,6 @@ void raylet::RayletClient::RequestObjectSpillage( grpc_client_->RequestObjectSpillage(request, callback); } -void raylet::RayletClient::RestoreSpilledObject( - const ObjectID &object_id, const std::string &object_url, - const NodeID &spilled_node_id, - const rpc::ClientCallback &callback) { - RAY_CHECK(!spilled_node_id.IsNil()); - rpc::RestoreSpilledObjectRequest request; - request.set_object_id(object_id.Binary()); - request.set_object_url(object_url); - request.set_spilled_node_id(spilled_node_id.Binary()); - grpc_client_->RestoreSpilledObject(request, callback); -} - Status raylet::RayletClient::ReturnWorker(int worker_port, const WorkerID &worker_id, bool disconnect_worker) { rpc::ReturnWorkerRequest request; diff --git a/src/ray/raylet_client/raylet_client.h b/src/ray/raylet_client/raylet_client.h index cf9cfea56d7f..185ca445ac3b 100644 --- a/src/ray/raylet_client/raylet_client.h +++ b/src/ray/raylet_client/raylet_client.h @@ -332,15 +332,6 @@ class RayletClient : public RayletClientInterface { const ObjectID &object_id, const rpc::ClientCallback &callback); - /// Ask the raylet to restore the object of a given id. - /// \param object_id Object id that the remote raylet needs to restore. - /// \param object_url Object URL where the object is spilled. - /// \param spilled_node_id Node id of a node where the object is spilled. 
- void RestoreSpilledObject( - const ObjectID &object_id, const std::string &object_url, - const NodeID &spilled_node_id, - const rpc::ClientCallback &callback); - /// Implements WorkerLeaseInterface. void RequestWorkerLease( const ray::TaskSpecification &resource_spec, diff --git a/src/ray/rpc/node_manager/node_manager_client.h b/src/ray/rpc/node_manager/node_manager_client.h index 81182ab94ab4..1c9b16c18370 100644 --- a/src/ray/rpc/node_manager/node_manager_client.h +++ b/src/ray/rpc/node_manager/node_manager_client.h @@ -100,9 +100,6 @@ class NodeManagerWorkerClient /// Ask the raylet to spill an object to external storage. VOID_RPC_CLIENT_METHOD(NodeManagerService, RequestObjectSpillage, grpc_client_, ) - /// Ask the raylet to restore an object from external storage. - VOID_RPC_CLIENT_METHOD(NodeManagerService, RestoreSpilledObject, grpc_client_, ) - /// Release unused bundles. VOID_RPC_CLIENT_METHOD(NodeManagerService, ReleaseUnusedBundles, grpc_client_, ) diff --git a/src/ray/rpc/node_manager/node_manager_server.h b/src/ray/rpc/node_manager/node_manager_server.h index 7f769150871c..08893d49f7a7 100644 --- a/src/ray/rpc/node_manager/node_manager_server.h +++ b/src/ray/rpc/node_manager/node_manager_server.h @@ -36,7 +36,6 @@ namespace rpc { RPC_SERVICE_HANDLER(NodeManagerService, CommitBundleResources) \ RPC_SERVICE_HANDLER(NodeManagerService, CancelResourceReserve) \ RPC_SERVICE_HANDLER(NodeManagerService, RequestObjectSpillage) \ - RPC_SERVICE_HANDLER(NodeManagerService, RestoreSpilledObject) \ RPC_SERVICE_HANDLER(NodeManagerService, ReleaseUnusedBundles) /// Interface of the `NodeManagerService`, see `src/ray/protobuf/node_manager.proto`. 
@@ -103,10 +102,6 @@ class NodeManagerServiceHandler { RequestObjectSpillageReply *reply, SendReplyCallback send_reply_callback) = 0; - virtual void HandleRestoreSpilledObject(const RestoreSpilledObjectRequest &request, - RestoreSpilledObjectReply *reply, - SendReplyCallback send_reply_callback) = 0; - virtual void HandleReleaseUnusedBundles(const ReleaseUnusedBundlesRequest &request, ReleaseUnusedBundlesReply *reply, SendReplyCallback send_reply_callback) = 0; From 5d23f967f35c0f0ecfc255e75b3388fe9156523b Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 216/244] Revert "[ray_client]: Add more retry logic (#13478)" This reverts commit bc386dd8d93d5ca81312932f22066b63b2133b62. --- python/ray/tests/BUILD | 1 - python/ray/tests/test_client.py | 51 ++++++++++++--------- python/ray/tests/test_client_init.py | 37 ---------------- python/ray/util/client/worker.py | 66 +++++----------------------- 4 files changed, 41 insertions(+), 114 deletions(-) delete mode 100644 python/ray/tests/test_client_init.py diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 8fe8b21c3369..7f4c61bb1cfb 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -79,7 +79,6 @@ py_test_module_list( "test_asyncio.py", "test_autoscaler.py", "test_autoscaler_yaml.py", - "test_client_init.py", "test_client_metadata.py", "test_client.py", "test_client_references.py", diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index dc5de2470e6e..21bb807fda55 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -2,13 +2,42 @@ import time import sys import logging -import threading import ray.util.client.server.server as ray_client_server +from ray.util.client import RayAPIStub from ray.util.client.common import ClientObjectRef from ray.util.client.ray_client_helpers import ray_start_client_server +def test_num_clients(shutdown_only): + # 
Tests num clients reporting; useful if you want to build an app that + # load balances clients between Ray client servers. + server = ray_client_server.serve("localhost:50051") + try: + api1 = RayAPIStub() + info1 = api1.connect("localhost:50051") + assert info1["num_clients"] == 1, info1 + api2 = RayAPIStub() + info2 = api2.connect("localhost:50051") + assert info2["num_clients"] == 2, info2 + + # Disconnect the first two clients. + api1.disconnect() + api2.disconnect() + time.sleep(1) + + api3 = RayAPIStub() + info3 = api3.connect("localhost:50051") + assert info3["num_clients"] == 1, info3 + + # Check info contains ray and python version. + assert isinstance(info3["ray_version"], str), info3 + assert isinstance(info3["ray_commit"], str), info3 + assert isinstance(info3["python_version"], str), info3 + finally: + server.stop(0) + + @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_real_ray_fallback(ray_start_regular_shared): with ray_start_client_server() as ray: @@ -344,25 +373,5 @@ def test_internal_kv(ray_start_regular_shared): assert ray._internal_kv_get("apple") == b"" -def test_startup_retry(ray_start_regular_shared): - from ray.util.client import ray as ray_client - ray_client._inside_client_test = True - - with pytest.raises(ConnectionError): - ray_client.connect("localhost:50051", connection_retries=1) - - def run_client(): - ray_client.connect("localhost:50051") - ray_client.disconnect() - - thread = threading.Thread(target=run_client, daemon=True) - thread.start() - time.sleep(3) - server = ray_client_server.serve("localhost:50051") - thread.join() - server.stop(0) - ray_client._inside_client_test = False - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py deleted file mode 100644 index 1949fe3fdc8f..000000000000 --- a/python/ray/tests/test_client_init.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Client tests that run 
their own init (as with init_and_serve) live here""" -import time - -import ray.util.client.server.server as ray_client_server - -from ray.util.client import RayAPIStub - - -def test_num_clients(): - # Tests num clients reporting; useful if you want to build an app that - # load balances clients between Ray client servers. - server, _ = ray_client_server.init_and_serve("localhost:50051") - try: - api1 = RayAPIStub() - info1 = api1.connect("localhost:50051") - assert info1["num_clients"] == 1, info1 - api2 = RayAPIStub() - info2 = api2.connect("localhost:50051") - assert info2["num_clients"] == 2, info2 - - # Disconnect the first two clients. - api1.disconnect() - api2.disconnect() - time.sleep(1) - - api3 = RayAPIStub() - info3 = api3.connect("localhost:50051") - assert info3["num_clients"] == 1, info3 - - # Check info contains ray and python version. - assert isinstance(info3["ray_version"], str), info3 - assert isinstance(info3["ray_commit"], str), info3 - assert isinstance(info3["python_version"], str), info3 - api3.disconnect() - finally: - ray_client_server.shutdown_with_server(server) - time.sleep(2) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index d62173be745f..3c6401fdafd6 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -5,7 +5,6 @@ import base64 import json import logging -import time import uuid from collections import defaultdict from typing import Any @@ -34,13 +33,6 @@ MAX_TIMEOUT_SEC = 30 -def backoff(timeout: int) -> int: - timeout = timeout + 5 - if timeout > MAX_TIMEOUT_SEC: - timeout = MAX_TIMEOUT_SEC - return timeout - - class Worker: def __init__(self, conn_str: str = "", @@ -67,59 +59,23 @@ def __init__(self, else: self.channel = grpc.insecure_channel(conn_str) - # Retry the connection until the channel responds to something - # looking like a gRPC connection, though it may be a proxy. 
conn_attempts = 0 timeout = INITIAL_TIMEOUT_SEC - ray_ready = False - while conn_attempts < max(connection_retries, 1): + while conn_attempts < connection_retries + 1: conn_attempts += 1 try: - # Let gRPC wait for us to see if the channel becomes ready. - # If it throws, we couldn't connect. grpc.channel_ready_future(self.channel).result(timeout=timeout) - # The HTTP2 channel is ready. Wrap the channel with the - # RayletDriverStub, allowing for unary requests. - self.server = ray_client_pb2_grpc.RayletDriverStub( - self.channel) - # Now the HTTP2 channel is ready, or proxied, but the - # servicer may not be ready. Call is_initialized() and if - # it throws, the servicer is not ready. On success, the - # `ray_ready` result is checked. - ray_ready = self.is_initialized() - if ray_ready: - # Ray is ready! Break out of the retry loop - break - # Ray is not ready yet, wait a timeout - time.sleep(timeout) + break except grpc.FutureTimeoutError: - logger.info( - f"Couldn't connect channel in {timeout} seconds, retrying") - # Note that channel_ready_future constitutes its own timeout, - # which is why we do not sleep here. - except grpc.RpcError as e: - if e.code() == grpc.StatusCode.UNAVAILABLE: - # UNAVAILABLE is gRPC's retryable error, - # so we do that here. - logger.info("Ray client server unavailable, " - f"retrying in {timeout}s...") - logger.debug(f"Received when checking init: {e.details()}") - # Ray is not ready yet, wait a timeout - time.sleep(timeout) - else: - # Any other gRPC error gets a reraise - raise e - # Fallthrough, backoff, and retry at the top of the loop - logger.info("Waiting for Ray to become ready on the server, " - f"retry in {timeout}s...") - timeout = backoff(timeout) - - # If we made it through the loop without ray_ready it means we've used - # up our retries and should error back to the user. - if not ray_ready: - raise ConnectionError("ray client connection timeout") - - # Initialize the streams to finish protocol negotiation. 
+ if conn_attempts >= connection_retries: + raise ConnectionError("ray client connection timeout") + logger.info(f"Couldn't connect in {timeout} seconds, retrying") + timeout = timeout + 5 + if timeout > MAX_TIMEOUT_SEC: + timeout = MAX_TIMEOUT_SEC + + self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel) + self.data_client = DataClient(self.channel, self._client_id, self.metadata) self.reference_count: Dict[bytes, int] = defaultdict(int) From a5acece537f68e69fae3343b134319b623fd6c31 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 217/244] Revert "deprecate useless fields in the cluster yaml. (#13637)" This reverts commit f33914f1a56185cd65913d050b0074d4eda2b4b0. --- dashboard/modules/reporter/reporter_head.py | 5 ++++- doc/examples/lm/lm-cluster.yaml | 17 +++++++++++++++++ python/ray/autoscaler/ray-schema.json | 12 ++++-------- python/ray/serve/benchmarks/cluster.yaml | 3 +++ .../test_cli_patterns/test_ray_up_config.yaml | 2 ++ .../test_ray_up_docker_config.yaml | 2 ++ python/ray/tests/test_coordinator_server.py | 2 ++ .../util/sgd/tf/examples/tf-example-sgd.yaml | 3 +++ .../sgd/torch/examples/benchmarks/README.rst | 1 + .../examples/benchmarks/horovod-benchmark.yaml | 3 +++ .../util/sgd/torch/examples/example-sgd.yaml | 3 +++ .../torch/examples/image_models/cluster.yaml | 3 +++ .../torch/examples/segmentation/example.yaml | 2 ++ .../sgd/torch/examples/sgd-development.yaml | 3 +++ .../torch/examples/transformers/cluster.yaml | 2 ++ release/horovod_tests/cluster.yaml | 2 ++ .../long_running_distributed_tests/cluster.yaml | 1 + release/rllib_tests/stress_tests/cluster.yaml | 1 + release/stress_tests/autoscaler-cluster.yaml | 7 +++++++ release/stress_tests/cluster.yaml | 7 +++++++ .../tune_tests/scalability_tests/cluster.yaml | 2 ++ release/xgboost_tests/cluster_cpu_moderate.yaml | 2 ++ release/xgboost_tests/cluster_cpu_small.yaml | 2 ++ 
release/xgboost_tests/cluster_gpu_small.yaml | 2 ++ 24 files changed, 80 insertions(+), 9 deletions(-) diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py index 7d375c8d66c4..2d84c6b65c21 100644 --- a/dashboard/modules/reporter/reporter_head.py +++ b/dashboard/modules/reporter/reporter_head.py @@ -78,7 +78,10 @@ async def get_ray_config(self, req) -> aiohttp.web.Response: payload = { "min_workers": cfg["min_workers"], - "max_workers": cfg["max_workers"] + "max_workers": cfg["max_workers"], + "initial_workers": cfg["initial_workers"], + "autoscaling_mode": cfg["autoscaling_mode"], + "idle_timeout_minutes": cfg["idle_timeout_minutes"], } try: diff --git a/doc/examples/lm/lm-cluster.yaml b/doc/examples/lm/lm-cluster.yaml index 7ea6641f588d..3590d482aa64 100644 --- a/doc/examples/lm/lm-cluster.yaml +++ b/doc/examples/lm/lm-cluster.yaml @@ -9,6 +9,23 @@ min_workers: 1 # node. This takes precedence over min_workers. max_workers: 2 +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 1 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.48 # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 5 diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index 7c7b2a1ed4ba..22b21b84cb66 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -24,7 +24,7 @@ "type": "string" }, "min_workers": { - "description": "DEPRECATED. Use the per node_type min_workers field instead.", + "description": "The minimum number of workers nodes to launch in addition to the head node. This number should be >= 0", "type": "integer", "minimum": 0 }, @@ -34,17 +34,17 @@ "minimum": 0 }, "initial_workers": { - "description": "DEPRECATED.", + "description": "The number of workers to launch initially, in addition to the head node.", "type": "integer", "minimum": 0 }, "autoscaling_mode": { - "description": "DEPRECATED. Use upscaling_speed instead.", + "description": "The mode of the autoscaler e.g. default, aggressive", "type": "string", "enum": [ "default", "aggressive" ] }, "target_utilization_fraction": { - "description": "DEPRECATED. Use upscaling_speed instead.", + "description": "The autoscaler will scale up the cluster to this target fraction of resources usage. For example, if a cluster of 8 nodes is 100% busy # and target_utilization was 0.8, it would resize the cluster to 10.", "type": "number", "minimum": 0, "maximum": 1 @@ -254,10 +254,6 @@ "type": "string", "description": "If using multiple node types, specifies the head node type." }, - "worker_default_node_type": { - "type": "string", - "description": "DEPRECATED." - }, "head_node": { "type": "object", "description": "Provider-specific config for the head node, e.g. instance type." 
diff --git a/python/ray/serve/benchmarks/cluster.yaml b/python/ray/serve/benchmarks/cluster.yaml index aad50bf97d3e..d588dc06a207 100644 --- a/python/ray/serve/benchmarks/cluster.yaml +++ b/python/ray/serve/benchmarks/cluster.yaml @@ -1,10 +1,13 @@ cluster_name: default min_workers: 5 max_workers: 5 +initial_workers: 5 +autoscaling_mode: default docker: image: 'anyscale/ray-ml:latest' container_name: ray_container pull_before_run: true +target_utilization_fraction: 0.8 idle_timeout_minutes: 5 provider: type: aws diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml index f3d6a03ce1b1..4d63420092e5 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml @@ -12,6 +12,7 @@ head_start_ray_commands: - ray stop - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 +initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -26,6 +27,7 @@ setup_commands: - echo a - echo b - echo ${echo hi} +target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t1.micro diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml index bffd0f53f2ae..8d898f749646 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml @@ -17,6 +17,7 @@ head_start_ray_commands: - ray stop - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 +initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -31,6 +32,7 @@ setup_commands: - echo a - echo b - echo ${echo hi} +target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t3a.small diff --git a/python/ray/tests/test_coordinator_server.py 
b/python/ray/tests/test_coordinator_server.py index 0c59b909e94c..6fb654e3e550 100644 --- a/python/ray/tests/test_coordinator_server.py +++ b/python/ray/tests/test_coordinator_server.py @@ -52,6 +52,7 @@ def testClusterStateInit(self): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, + "initial_workers": 0, "provider": { "type": "local", "head_ip": "0.0.0.0:2", @@ -153,6 +154,7 @@ def testCoordinatorSenderNodeProvider(self): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, + "initial_workers": 0, "provider": { "type": "local", "coordinator_address": self.coordinator_address, diff --git a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml index fcf31354b70e..846f5f10ce3c 100644 --- a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml +++ b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-tf # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 +initial_workers: 3 max_workers: 3 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/README.rst b/python/ray/util/sgd/torch/examples/benchmarks/README.rst index 54b3ce192b68..78dd71a15f51 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/README.rst +++ b/python/ray/util/sgd/torch/examples/benchmarks/README.rst @@ -104,6 +104,7 @@ You can specify the number of nodes you want to use with the following configura # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. 
min_workers: # Change this to a custom quantity + initial_workers: # same as above max_workers: # same as above You may want to install FP16 support for PyTorch with the following configuration in the YAML file: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml index 7e3db50510ff..04cbd520e135 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml +++ b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml @@ -4,8 +4,11 @@ cluster_name: horovod-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 +initial_workers: 1 max_workers: 1 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 50 # docker: diff --git a/python/ray/util/sgd/torch/examples/example-sgd.yaml b/python/ray/util/sgd/torch/examples/example-sgd.yaml index 6bbc64423aab..fe9b18d191b0 100644 --- a/python/ray/util/sgd/torch/examples/example-sgd.yaml +++ b/python/ray/util/sgd/torch/examples/example-sgd.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 +initial_workers: 3 max_workers: 3 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml index 7d9ff9be89e0..fccd5f8625bd 100644 --- a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-pytorch-imagenet # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 +initial_workers: 1 max_workers: 1 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/segmentation/example.yaml b/python/ray/util/sgd/torch/examples/segmentation/example.yaml index 33db0f445537..78cd9bcb09ba 100644 --- a/python/ray/util/sgd/torch/examples/segmentation/example.yaml +++ b/python/ray/util/sgd/torch/examples/segmentation/example.yaml @@ -4,8 +4,10 @@ cluster_name: sgd-coco-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 +initial_workers: 1 max_workers: 1 +target_utilization_fraction: 0.9 # Cloud-provider specific configuration. provider: type: aws diff --git a/python/ray/util/sgd/torch/examples/sgd-development.yaml b/python/ray/util/sgd/torch/examples/sgd-development.yaml index bc79803eeadd..590cb63b0708 100644 --- a/python/ray/util/sgd/torch/examples/sgd-development.yaml +++ b/python/ray/util/sgd/torch/examples/sgd-development.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. 
min_workers: 2 +initial_workers: 2 max_workers: 2 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml index 434b48d3044f..4cecd3bf86a1 100644 --- a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml @@ -4,8 +4,10 @@ cluster_name: transformer-cluster # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 +initial_workers: 3 max_workers: 3 +target_utilization_fraction: 0.9 # Cloud-provider specific configuration. provider: type: aws diff --git a/release/horovod_tests/cluster.yaml b/release/horovod_tests/cluster.yaml index 5dbc457a78c7..880ebdba2423 100644 --- a/release/horovod_tests/cluster.yaml +++ b/release/horovod_tests/cluster.yaml @@ -10,6 +10,8 @@ min_workers: 3 # node. This takes precedence over min_workers. min_workers defaults to 0. max_workers: 3 +target_utilization_fraction: 0.8 + # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 5 diff --git a/release/long_running_distributed_tests/cluster.yaml b/release/long_running_distributed_tests/cluster.yaml index 4710a47fcc4a..f8d10549a24c 100644 --- a/release/long_running_distributed_tests/cluster.yaml +++ b/release/long_running_distributed_tests/cluster.yaml @@ -3,6 +3,7 @@ cluster_name: long-running-distributed-tests min_workers: 3 max_workers: 3 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/rllib_tests/stress_tests/cluster.yaml b/release/rllib_tests/stress_tests/cluster.yaml index 4c83e27c33aa..8f20a46afb85 100644 --- a/release/rllib_tests/stress_tests/cluster.yaml +++ b/release/rllib_tests/stress_tests/cluster.yaml @@ -3,6 +3,7 @@ cluster_name: ray-rllib-stress-tests min_workers: 9 max_workers: 9 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/stress_tests/autoscaler-cluster.yaml b/release/stress_tests/autoscaler-cluster.yaml index 9c17d303e4db..ed5ee2bd58f1 100644 --- a/release/stress_tests/autoscaler-cluster.yaml +++ b/release/stress_tests/autoscaler-cluster.yaml @@ -13,6 +13,13 @@ min_workers: 100 # node. This takes precedence over min_workers. max_workers: 100 +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/stress_tests/cluster.yaml b/release/stress_tests/cluster.yaml index 155ae1329c0b..a513d9764c11 100644 --- a/release/stress_tests/cluster.yaml +++ b/release/stress_tests/cluster.yaml @@ -13,6 +13,13 @@ min_workers: 100 # node. This takes precedence over min_workers. 
max_workers: 100 +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/tune_tests/scalability_tests/cluster.yaml b/release/tune_tests/scalability_tests/cluster.yaml index fd966898b8a7..e279efb37dab 100644 --- a/release/tune_tests/scalability_tests/cluster.yaml +++ b/release/tune_tests/scalability_tests/cluster.yaml @@ -2,7 +2,9 @@ cluster_name: ray-tune-scalability-tests min_workers: 15 max_workers: 15 +initial_workers: 15 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_cpu_moderate.yaml b/release/xgboost_tests/cluster_cpu_moderate.yaml index a65c49336a1c..18a18dceb56e 100644 --- a/release/xgboost_tests/cluster_cpu_moderate.yaml +++ b/release/xgboost_tests/cluster_cpu_moderate.yaml @@ -2,7 +2,9 @@ cluster_name: ray-xgboost-release-cpu-moderate min_workers: 31 max_workers: 31 +initial_workers: 31 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_cpu_small.yaml b/release/xgboost_tests/cluster_cpu_small.yaml index 4b97439b9d59..fe9e997f85aa 100644 --- a/release/xgboost_tests/cluster_cpu_small.yaml +++ b/release/xgboost_tests/cluster_cpu_small.yaml @@ -2,7 +2,9 @@ cluster_name: ray-xgboost-release-cpu-small min_workers: 3 max_workers: 3 +initial_workers: 3 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_gpu_small.yaml b/release/xgboost_tests/cluster_gpu_small.yaml index 535d28490f71..5bea4f19acf2 100644 --- a/release/xgboost_tests/cluster_gpu_small.yaml +++ 
b/release/xgboost_tests/cluster_gpu_small.yaml @@ -2,7 +2,9 @@ cluster_name: ray-xgboost-release-gpu-small min_workers: 4 max_workers: 4 +initial_workers: 4 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: From 983ea0d4aa441b784186688f7c0d95deedc9ba95 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 218/244] Revert "[tune] update Optuna integration to 2.4.0 API (#13631)" This reverts commit b272ae89bf64cc9190323654d06583cfc53d23fb. --- python/ray/tune/suggest/optuna.py | 10 +---- .../linux-py3.6-requirements_tune.txt | 45 ++++++++++--------- .../linux-py3.7-requirements_tune.txt | 43 +++++++++--------- .../linux-py3.8-requirements_tune.txt | 14 +++--- python/requirements/requirements_tune.in | 2 +- 5 files changed, 55 insertions(+), 59 deletions(-) diff --git a/python/ray/tune/suggest/optuna.py b/python/ray/tune/suggest/optuna.py index 61dd13d62646..a6468b8617dd 100644 --- a/python/ray/tune/suggest/optuna.py +++ b/python/ray/tune/suggest/optuna.py @@ -218,14 +218,8 @@ def on_trial_complete(self, error: bool = False): ot_trial = self._ot_trials[trial_id] ot_trial_id = ot_trial._trial_id - - val = result.get(self.metric, None) - if hasattr(self._storage, "set_trial_value"): - # Backwards compatibility with optuna < 2.4.0 - self._storage.set_trial_value(ot_trial_id, val) - else: - self._storage.set_trial_values(ot_trial_id, [val]) - + self._storage.set_trial_value(ot_trial_id, result.get( + self.metric, None)) self._storage.set_trial_state(ot_trial_id, ot.trial.TrialState.COMPLETE) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index 4351d0b6386f..8d75554d451b 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210122 
+autogluon.core==0.0.16b20210113 # via gluoncv autograd==1.3 # via autogluon.core @@ -35,7 +35,7 @@ ax-platform==0.1.9 ; python_version < "3.7" # via -r requirements_tune.in azure-core==1.10.0 # via azure-storage-blob -azure-storage-blob==12.7.1 +azure-storage-blob==12.6.0 # via mlflow backcall==0.2.0 # via ipython @@ -45,16 +45,16 @@ bayesian-optimization==1.2.0 # nevergrad bcrypt==3.2.0 # via paramiko -bleach==3.2.2 +bleach==3.2.1 # via nbconvert bokeh==2.2.3 # via dask -boto3==1.16.58 +boto3==1.16.53 # via # -c ../requirements.txt # autogluon.core # smart-open -botocore==1.19.58 +botocore==1.19.53 # via # boto3 # s3transfer @@ -87,7 +87,7 @@ click==7.1.2 # mlflow # sacremoses # wandb -cliff==3.6.0 +cliff==3.5.0 # via optuna cloudpickle==1.6.0 # via @@ -107,7 +107,7 @@ colorama==0.4.4 # via # -c ../requirements.txt # cmd2 -colorlog==4.7.2 +colorlog==4.6.2 # via optuna configparser==5.0.1 # via wandb @@ -129,7 +129,7 @@ cython==0.29.0 # -c ../requirements.txt # autogluon.core # configspace -dask[complete]==2021.1.0 +dask[complete]==2020.12.0 # via # -c ../requirements.txt # autogluon.core @@ -155,7 +155,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.0 +distributed==2020.12.0 # via # autogluon.core # dask @@ -213,13 +213,13 @@ google-auth==1.24.0 # tensorboard gpy==1.9.9 # via -r requirements_tune.in -gpytorch==1.3.1 +gpytorch==1.3.0 # via botorch graphviz==0.8.4 # via # autogluon.core # mxnet -grpcio==1.35.0 +grpcio==1.34.1 # via # -c ../requirements.txt # tensorboard @@ -330,9 +330,9 @@ kubernetes==12.0.1 # -r requirements_tune.in lightgbm==3.1.1 # via -r requirements_tune.in -locket==0.2.1 +locket==0.2.0 # via partd -mako==1.1.4 +mako==1.1.3 # via alembic markdown==3.3.3 # via tensorboard @@ -366,7 +366,7 @@ nbconvert==6.0.7 # via # jupyter # notebook -nbformat==5.1.2 +nbformat==5.0.8 # via # ipywidgets # nbclient @@ -436,7 +436,7 @@ opencv-python==4.5.1.48 # via # gluoncv # gym -optuna==2.4.0 +optuna==2.3.0 # via -r 
requirements_tune.in packaging==20.8 # via @@ -501,7 +501,7 @@ prometheus-flask-exporter==0.18.1 # via mlflow promise==2.3 # via wandb -prompt-toolkit==3.0.13 +prompt-toolkit==3.0.10 # via # ipython # jupyter-console @@ -584,7 +584,7 @@ pytorch-lightning==1.0.3 # pytorch-lightning-bolts pytz==2020.5 # via pandas -pyyaml==5.4.1 +pyyaml==5.3.1 # via # -c ../requirements.txt # autocfg @@ -600,12 +600,12 @@ pyyaml==5.4.1 # pytorch-lightning # wandb # yacs -pyzmq==21.0.1 +pyzmq==20.0.0 # via # jupyter-client # notebook # qtconsole -qtconsole==5.0.2 +qtconsole==5.0.1 # via jupyter qtpy==1.9.0 # via qtconsole @@ -703,6 +703,7 @@ six==1.15.0 # azure-core # bcrypt # bleach + # cliff # cryptography # cycler # databricks-cli @@ -735,7 +736,7 @@ six==1.15.0 # traitlets # wandb # websocket-client -smart_open[s3]==4.0.1 +smart_open==4.0.1 # via # -c ../requirements.txt # -r requirements_tune.in @@ -762,9 +763,9 @@ tabulate==0.8.7 # databricks-cli tblib==1.7.0 # via distributed -tensorboard-plugin-wit==1.8.0 +tensorboard-plugin-wit==1.7.0 # via tensorboard -tensorboard==2.4.1 +tensorboard==2.4.0 # via pytorch-lightning tensorboardx==2.1 # via diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index c7a7b9204649..1ac1824330c0 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210122 +autogluon.core==0.0.16b20210113 # via gluoncv autograd==1.3 # via autogluon.core @@ -35,7 +35,7 @@ ax-platform==0.1.19 ; python_version >= "3.7" # via -r requirements_tune.in azure-core==1.10.0 # via azure-storage-blob -azure-storage-blob==12.7.1 +azure-storage-blob==12.6.0 # via mlflow backcall==0.2.0 # via ipython @@ -45,16 +45,16 @@ bayesian-optimization==1.2.0 # nevergrad bcrypt==3.2.0 # via paramiko -bleach==3.2.2 +bleach==3.2.1 # via nbconvert bokeh==2.2.3 
# via dask -boto3==1.16.58 +boto3==1.16.53 # via # -c ../requirements.txt # autogluon.core # smart-open -botocore==1.19.58 +botocore==1.19.53 # via # boto3 # s3transfer @@ -87,7 +87,7 @@ click==7.1.2 # mlflow # sacremoses # wandb -cliff==3.6.0 +cliff==3.5.0 # via optuna cloudpickle==1.6.0 # via @@ -107,7 +107,7 @@ colorama==0.4.4 # via # -c ../requirements.txt # cmd2 -colorlog==4.7.2 +colorlog==4.6.2 # via optuna configparser==5.0.1 # via wandb @@ -127,7 +127,7 @@ cython==0.29.0 # -c ../requirements.txt # autogluon.core # configspace -dask[complete]==2021.1.0 +dask[complete]==2020.12.0 # via # -c ../requirements.txt # autogluon.core @@ -148,7 +148,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.0 +distributed==2020.12.0 # via # autogluon.core # dask @@ -206,13 +206,13 @@ google-auth==1.24.0 # tensorboard gpy==1.9.9 # via -r requirements_tune.in -gpytorch==1.3.1 +gpytorch==1.3.0 # via botorch graphviz==0.8.4 # via # autogluon.core # mxnet -grpcio==1.35.0 +grpcio==1.34.0 # via # -c ../requirements.txt # tensorboard @@ -321,9 +321,9 @@ kubernetes==12.0.1 # -r requirements_tune.in lightgbm==3.1.1 # via -r requirements_tune.in -locket==0.2.1 +locket==0.2.0 # via partd -mako==1.1.4 +mako==1.1.3 # via alembic markdown==3.3.3 # via tensorboard @@ -357,7 +357,7 @@ nbconvert==6.0.7 # via # jupyter # notebook -nbformat==5.1.2 +nbformat==5.0.8 # via # ipywidgets # nbclient @@ -427,7 +427,7 @@ opencv-python==4.5.1.48 # via # gluoncv # gym -optuna==2.4.0 +optuna==2.3.0 # via -r requirements_tune.in packaging==20.8 # via @@ -492,7 +492,7 @@ prometheus-flask-exporter==0.18.1 # via mlflow promise==2.3 # via wandb -prompt-toolkit==3.0.13 +prompt-toolkit==3.0.10 # via # ipython # jupyter-console @@ -575,7 +575,7 @@ pytorch-lightning==1.0.3 # pytorch-lightning-bolts pytz==2020.5 # via pandas -pyyaml==5.4.1 +pyyaml==5.3.1 # via # -c ../requirements.txt # autocfg @@ -591,12 +591,12 @@ pyyaml==5.4.1 # pytorch-lightning # wandb # yacs 
-pyzmq==21.0.1 +pyzmq==20.0.0 # via # jupyter-client # notebook # qtconsole -qtconsole==5.0.2 +qtconsole==5.0.1 # via jupyter qtpy==1.9.0 # via qtconsole @@ -694,6 +694,7 @@ six==1.15.0 # azure-core # bcrypt # bleach + # cliff # cryptography # cycler # databricks-cli @@ -752,9 +753,9 @@ tabulate==0.8.7 # databricks-cli tblib==1.7.0 # via distributed -tensorboard-plugin-wit==1.8.0 +tensorboard-plugin-wit==1.7.0 # via tensorboard -tensorboard==2.4.1 +tensorboard==2.4.0 # via pytorch-lightning tensorboardx==2.1 # via diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt index 195951424490..36dbb1dce9ad 100644 --- a/python/requirements/linux-py3.8-requirements_tune.txt +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210122 +autogluon.core==0.0.16b20210121 # via gluoncv autograd==1.3 # via autogluon.core @@ -49,12 +49,12 @@ bleach==3.2.2 # via nbconvert bokeh==2.2.3 # via dask -boto3==1.16.58 +boto3==1.16.57 # via # -c ../requirements.txt # autogluon.core # smart-open -botocore==1.19.58 +botocore==1.19.57 # via # boto3 # s3transfer @@ -216,7 +216,7 @@ grpcio==1.35.0 # tensorboard gunicorn==20.0.4 # via mlflow -gym==0.18.0 +gym[atari]==0.18.0 # via # -c ../requirements.txt # -r requirements_tune.in @@ -417,7 +417,7 @@ opencv-python==4.5.1.48 # via # gluoncv # gym -optuna==2.4.0 +optuna==2.3.0 # via -r requirements_tune.in packaging==20.8 # via @@ -482,7 +482,7 @@ prometheus-flask-exporter==0.18.1 # via mlflow promise==2.3 # via wandb -prompt-toolkit==3.0.13 +prompt-toolkit==3.0.11 # via # ipython # jupyter-console @@ -586,7 +586,7 @@ pyzmq==21.0.1 # jupyter-client # notebook # qtconsole -qtconsole==5.0.2 +qtconsole==5.0.1 # via jupyter qtpy==1.9.0 # via qtconsole diff --git a/python/requirements/requirements_tune.in b/python/requirements/requirements_tune.in index 96a263204e97..9bb83cbeec73 100644 
--- a/python/requirements/requirements_tune.in +++ b/python/requirements/requirements_tune.in @@ -20,7 +20,7 @@ matplotlib==3.3.3 mlflow==1.13.1 mxnet==1.7.0.post1 nevergrad==0.4.2.post5 -optuna==2.4.0 +optuna==2.3.0 pytest-remotedata==0.3.2 pytorch-lightning-bolts==0.2.5 pytorch-lightning==1.0.3 From fe1a970426ff55168513e0808c126420b0b7d176 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 219/244] Revert "Remove idle actor from worker pool. (#13523)" This reverts commit be8d9ae881ef5f4b66e2b1cc0e4b5188ea50535f. --- src/ray/raylet/worker_pool.cc | 32 ++++++++++++++++--------- src/ray/raylet/worker_pool.h | 2 ++ src/ray/raylet/worker_pool_test.cc | 38 ++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 4ed257f4602e..93a568748e80 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -159,8 +159,9 @@ Process WorkerPool::StartWorkerProcess( return Process(); } // Either there are no workers pending registration or the worker start is being forced. - RAY_LOG(DEBUG) << "Starting new worker process, current pool has " << state.idle.size() - << " workers"; + RAY_LOG(DEBUG) << "Starting new worker process, current pool has " + << state.idle_actor.size() << " actor workers, and " << state.idle.size() + << " non-actor workers"; int workers_to_start = 1; if (dynamic_options.empty()) { @@ -624,11 +625,15 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { state.idle_dedicated_workers[task_id] = worker; } else { // The worker is not used for the actor creation task with dynamic options. - // Put the worker to the idle pool. - state.idle.insert(worker); - int64_t now = current_time_ms(); - idle_of_all_languages_.emplace_back(worker, now); - idle_of_all_languages_map_[worker] = now; + // Put the worker to the corresponding idle pool. 
+ if (worker->GetActorId().IsNil()) { + state.idle.insert(worker); + int64_t now = current_time_ms(); + idle_of_all_languages_.emplace_back(worker, now); + idle_of_all_languages_map_[worker] = now; + } else { + state.idle_actor[worker->GetActorId()] = worker; + } } } @@ -782,10 +787,7 @@ std::shared_ptr WorkerPool::PopWorker( state.tasks_to_dedicated_workers[task_spec.TaskId()] = proc; } } - } else if (task_spec.IsActorTask()) { - // Code path of actor task. - RAY_CHECK(false) << "Direct call shouldn't reach here."; - } else { + } else if (!task_spec.IsActorTask()) { // Code path of normal task or actor creation task without dynamic worker options. // Find an available worker which is already assigned to this job. // Try to pop the most recently pushed worker. @@ -810,6 +812,14 @@ std::shared_ptr WorkerPool::PopWorker( proc = StartWorkerProcess(task_spec.GetLanguage(), rpc::WorkerType::WORKER, task_spec.JobId()); } + } else { + // Code path of actor task. + const auto &actor_id = task_spec.ActorId(); + auto actor_entry = state.idle_actor.find(actor_id); + if (actor_entry != state.idle_actor.end()) { + worker = std::move(actor_entry->second); + state.idle_actor.erase(actor_entry); + } } if (worker == nullptr && proc.IsValid()) { diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h index 703fbf77b781..66d4b94c7700 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -358,6 +358,8 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { std::unordered_map> idle_dedicated_workers; /// The pool of idle non-actor workers. std::unordered_set> idle; + /// The pool of idle actor workers. + std::unordered_map> idle_actor; // States for io workers used for spilling objects. IOWorkerState spill_io_worker_state; // States for io workers used for restoring objects. 
diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc index 0d2c0e314f34..ee8f3356bb77 100644 --- a/src/ray/raylet/worker_pool_test.cc +++ b/src/ray/raylet/worker_pool_test.cc @@ -343,6 +343,28 @@ TEST_F(WorkerPoolTest, HandleWorkerPushPop) { ASSERT_EQ(popped_worker, nullptr); } +TEST_F(WorkerPoolTest, PopActorWorker) { + // Create a worker. + auto worker = CreateWorker(Process::CreateNewDummy()); + // Add the worker to the pool. + worker_pool_->PushWorker(worker); + + // Assign an actor ID to the worker. + const auto task_spec = ExampleTaskSpec(); + auto actor = worker_pool_->PopWorker(task_spec); + auto actor_id = ActorID::Of(JOB_ID, TaskID::ForDriverTask(JOB_ID), 1); + actor->AssignActorId(actor_id); + worker_pool_->PushWorker(actor); + + // Check that there are no more non-actor workers. + ASSERT_EQ(worker_pool_->PopWorker(task_spec), nullptr); + // Check that we can pop the actor worker. + const auto actor_task_spec = ExampleTaskSpec(actor_id); + actor = worker_pool_->PopWorker(actor_task_spec); + ASSERT_EQ(actor, worker); + ASSERT_EQ(actor->GetActorId(), actor_id); +} + TEST_F(WorkerPoolTest, PopWorkersOfMultipleLanguages) { // Create a Python Worker, and add it to the pool auto py_worker = CreateWorker(Process::CreateNewDummy(), Language::PYTHON); @@ -406,19 +428,25 @@ TEST_F(WorkerPoolTest, PopWorkerMultiTenancy) { worker_pool_->PushWorker(worker); } } + std::unordered_set worker_ids; for (int round = 0; round < 2; round++) { std::vector> workers; - // Pop workers for actor. + // Pop workers for actor (creation) tasks. for (auto job_id : job_ids) { - auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); - // Pop workers for actor creation tasks. - auto task_spec = ExampleTaskSpec(/*actor_id=*/ActorID::Nil(), Language::PYTHON, - job_id, actor_creation_id); + auto actor_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); + // For the first round, we pop for actor creation tasks. 
+ // For the second round, we pop for actor tasks. + auto task_spec = + ExampleTaskSpec(round == 0 ? ActorID::Nil() : actor_id, Language::PYTHON, + job_id, round == 0 ? actor_id : ActorID::Nil()); auto worker = worker_pool_->PopWorker(task_spec); ASSERT_TRUE(worker); ASSERT_EQ(worker->GetAssignedJobId(), job_id); + if (round == 0) { + worker->AssignActorId(actor_id); + } workers.push_back(worker); } From 03222d24cb2251e45405c88a200329b1f292851a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 220/244] Revert "[horovod] Horovod+Ray Pytorch Lightning Accelerator (#13458)" This reverts commit 7340cf1cbfa911c819b27e56359b9955d763fa09. --- .travis.yml | 1 - python/ray/tune/examples/mnist_ptl_mini.py | 3 +- python/ray/util/lightning_accelerators/BUILD | 33 --- .../util/lightning_accelerators/__init__.py | 4 - .../examples/ptl_horovod_ray_example.py | 195 ------------------ .../horovod_ray_accelerator.py | 121 ----------- .../tests/test_horovod_ray_accelerator.py | 191 ----------------- 7 files changed, 1 insertion(+), 547 deletions(-) delete mode 100644 python/ray/util/lightning_accelerators/BUILD delete mode 100644 python/ray/util/lightning_accelerators/__init__.py delete mode 100644 python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py delete mode 100644 python/ray/util/lightning_accelerators/horovod_ray_accelerator.py delete mode 100644 python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py diff --git a/.travis.yml b/.travis.yml index 4d8f8ddd1255..5170ed0864b8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -420,7 +420,6 @@ matrix: script: - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/... - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... 
- - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/lightning_accelerators/... # There are no python 3.7 tests for RaySGD at the moment # - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=py37 python/ray/util/sgd/... # - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=py37 doc/... diff --git a/python/ray/tune/examples/mnist_ptl_mini.py b/python/ray/tune/examples/mnist_ptl_mini.py index e3b226d44566..b1c2e2aa9a09 100644 --- a/python/ray/tune/examples/mnist_ptl_mini.py +++ b/python/ray/tune/examples/mnist_ptl_mini.py @@ -1,7 +1,7 @@ import torch from torch.nn import functional as F import pytorch_lightning as pl -from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule +from pl_bolts.datamodules import MNISTDataModule import os from ray.tune.integration.pytorch_lightning import TuneReportCallback @@ -16,7 +16,6 @@ def __init__(self, config, data_dir=None): self.data_dir = data_dir or os.getcwd() self.lr = config["lr"] layer_1, layer_2 = config["layer_1"], config["layer_2"] - self.batch_size = config["batch_size"] # mnist images are (1, 28, 28) (channels, width, height) self.layer_1 = torch.nn.Linear(28 * 28, layer_1) diff --git a/python/ray/util/lightning_accelerators/BUILD b/python/ray/util/lightning_accelerators/BUILD deleted file mode 100644 index 4355c6d33bb4..000000000000 --- a/python/ray/util/lightning_accelerators/BUILD +++ /dev/null @@ -1,33 +0,0 @@ -# -------------------------------------------------------------------- -# Tests from the python/ray/util/lightning_accelerators/tests directory. -# Please keep these sorted alphabetically. 
-# -------------------------------------------------------------------- - -py_test( - name = "test_horovod_ray_accelerator", - size = "medium", - srcs = ["tests/test_horovod_ray_accelerator.py"], - tags = ["exclusive", "pytorch-lightning", "pytorch", "horovod"], - deps = [":accelerator_lib"], -) - -# -------------------------------------------------------------------- -# Tests from the python/ray/util/lightning_accelerators/examples directory. -# Please keep these sorted alphabetically. -# -------------------------------------------------------------------- - -py_test( - name = "ptl_horovod_ray_example", - size = "medium", - srcs = ["examples/ptl_horovod_ray_example.py"], - tags = ["exclusive", "example", "pytorch-lightning", "pytorch", "horovod"], - deps = [":accelerator_lib"], - args = ["--smoke-test"] -) - -# # This is a dummy test dependency that causes the above tests to be -# # re-run if any of these files changes. -py_library( - name = "accelerator_lib", - srcs = glob(["**/*.py"], exclude=["tests/*.py"]), -) diff --git a/python/ray/util/lightning_accelerators/__init__.py b/python/ray/util/lightning_accelerators/__init__.py deleted file mode 100644 index 038180e016ef..000000000000 --- a/python/ray/util/lightning_accelerators/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from ray.util.lightning_accelerators.horovod_ray_accelerator import \ - HorovodRayAccelerator - -__all__ = ["HorovodRayAccelerator"] diff --git a/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py b/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py deleted file mode 100644 index fffcfb01f54b..000000000000 --- a/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Example using Pytorch Lightning with a Horovod on Ray Accelerator.""" -import os -import tempfile - -import pytorch_lightning as pl -import torch -from torch.utils.data import random_split, DataLoader -from torchvision.datasets 
import MNIST -from torchvision import transforms - -import ray -from ray import tune -from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier -from ray.tune.integration.pytorch_lightning import TuneReportCallback -from ray.util.lightning_accelerators import HorovodRayAccelerator - - -class MNISTClassifier(LightningMNISTClassifier): - def prepare_data(self): - self.dataset = MNIST( - self.data_dir, - train=True, - download=True, - transform=transforms.ToTensor()) - - def train_dataloader(self): - dataset = self.dataset - train_length = len(dataset) - dataset_train, _ = random_split( - dataset, [train_length - 5000, 5000], - generator=torch.Generator().manual_seed(0)) - loader = DataLoader( - dataset_train, - batch_size=self.batch_size, - shuffle=True, - num_workers=1, - drop_last=True, - pin_memory=True, - ) - return loader - - def val_dataloader(self): - dataset = self.dataset - train_length = len(dataset) - _, dataset_val = random_split( - dataset, [train_length - 5000, 5000], - generator=torch.Generator().manual_seed(0)) - loader = DataLoader( - dataset_val, - batch_size=self.batch_size, - shuffle=False, - num_workers=1, - drop_last=True, - pin_memory=True, - ) - return loader - - -def train_mnist(config, - data_dir=None, - num_epochs=10, - num_hosts=1, - num_slots=4, - use_gpu=False, - callbacks=None): - model = MNISTClassifier(config, data_dir) - - callbacks = callbacks or [] - - trainer = pl.Trainer( - max_epochs=num_epochs, - gpus=int(use_gpu), - callbacks=callbacks, - accelerator=HorovodRayAccelerator( - num_hosts=num_hosts, num_slots=num_slots, use_gpu=use_gpu)) - trainer.fit(model) - - -def tune_mnist(data_dir, - num_samples=10, - num_epochs=10, - num_hosts=1, - num_slots=4, - use_gpu=False): - config = { - "layer_1": tune.choice([32, 64, 128]), - "layer_2": tune.choice([64, 128, 256]), - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - } - - # Add Tune callback. 
- metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"} - callbacks = [TuneReportCallback(metrics, on="validation_end")] - trainable = tune.with_parameters( - train_mnist, - data_dir=data_dir, - num_epochs=num_epochs, - num_hosts=num_hosts, - num_slots=num_slots, - use_gpu=use_gpu, - callbacks=callbacks) - analysis = tune.run( - trainable, - metric="loss", - mode="min", - config=config, - num_samples=num_samples, - resources_per_trial={ - "cpu": 1, - # Assume 1 cpu per slot. - "extra_cpu": num_hosts * num_slots, - # Assume 1 gpu per slot. - "extra_gpu": num_hosts * num_slots * int(use_gpu) - }, - name="tune_mnist") - - print("Best hyperparameters found were: ", analysis.best_config) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--num-hosts", - type=int, - help="Number of machines to train on. If using Tune, then each " - "trial will use this many machines.", - default=1) - parser.add_argument( - "--num-slots", - type=int, - help="Number of workers to " - "place on each " - "machine. 
If using " - "Tune, then each trial will use this many slots per machine.", - default=1) - parser.add_argument( - "--use-gpu", action="store_true", help="Use GPU for " - "training.") - parser.add_argument( - "--tune", - action="store_true", - help="Use Ray Tune " - "for " - "hyperparameter " - "tuning.") - parser.add_argument( - "--num-samples", - type=int, - default=10, - help="Number " - "of " - "samples to tune.") - parser.add_argument( - "--num-epochs", - type=int, - default=10, - help="Number " - "of " - "epochs " - "to train for.") - parser.add_argument( - "--smoke-test", action="store_true", help="Finish quickly for testing") - parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") - args, _ = parser.parse_known_args() - - num_epochs = 1 if args.smoke_test else args.num_epochs - num_hosts = 1 if args.smoke_test else args.num_hosts - num_slots = 1 if args.smoke_test else args.num_slots - use_gpu = False if args.smoke_test else args.use_gpu - num_samples = 1 if args.smoke_test else args.num_samples - - if args.smoke_test: - ray.init(num_cpus=2) - else: - ray.init(address=args.address) - - data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_") - - if args.tune: - raise NotImplementedError("Using Tune + Pytorch Lightning with " - "distributed training is currently not " - "supported.") - tune_mnist(data_dir, num_samples, num_epochs, num_hosts, num_slots, - use_gpu) - else: - config = {"layer_1": 32, "layer_2": 64, "lr": 1e-1, "batch_size": 32} - train_mnist(config, data_dir, num_epochs, num_hosts, num_slots, - use_gpu) diff --git a/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py b/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py deleted file mode 100644 index 04f73317a923..000000000000 --- a/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py +++ /dev/null @@ -1,121 +0,0 @@ -import ray -from pytorch_lightning.accelerators.horovod_accelerator import \ - 
HorovodAccelerator - -try: - import horovod.torch as hvd - from horovod.ray import RayExecutor -except (ModuleNotFoundError, ImportError): - HOROVOD_AVAILABLE = False -else: - HOROVOD_AVAILABLE = True - - -def get_executable_cls(): - # Only used for testing purposes, currently. - # We need to override this in tests to ensure test path is set correctly. - return None - - -class HorovodRayAccelerator(HorovodAccelerator): - """Pytorch Lightning Accelerator for Horovod training on a Ray cluster. - - This accelerator is used to manage distributed training on a Ray cluster - via the Horovod training framework. Internally, the specified number of - Ray actors are launched in the cluster and are configured as part of the - Horovod ring. The Pytorch Lightning trainer is instantiated on the - driver and sent to each of these training workers where training is - executed. The distributed training protocol is handled by Horovod. - - Each training worker is configured to reserve 1 CPU and if 1 GPU if - ``use_gpu`` is set to ``True``. - - If using this accelerator, you should run your code like a normal Python - script: ``python train.py``, and not with ``horovodrun``. - - Args: - num_hosts (int): The number of nodes/machines to execute the job on. - num_slots (int): Number of workers to be placed on each machine. - use_gpu (bool): Whether to use GPU for allocation. For GPU to be - used, you must also set the ``gpus`` arg in your Pytorch Lightning - Trainer to a value > 0. - - Example: - - .. code_block:: python - - import pytorch_lightning as ptl - from ray.util.lightning_accelerators import HorovodRayAccelerator - - ptl_model = MNISTClassifier(...) - # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU. - accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4, - use_gpu=True). - - # If using GPUs, set the ``gpus`` arg to a value > 0. - # The actual number of GPUs is determined by ``num_slots``. - trainer = pl.Trainer(..., gpus=1, accelerator=accelerator). 
- trainer.fit(ptl_model). - - """ - - def __init__(self, - *args, - num_hosts=1, - num_slots=1, - use_gpu=False, - **kwargs): - super().__init__(*args, trainer=None, **kwargs) - self.nickname = "horovod_ray" - self.num_hosts = num_hosts - self.num_slots = num_slots - self.use_gpu = use_gpu - - def setup(self, model): - self.trainer.use_horovod = True - settings = RayExecutor.create_settings(timeout_s=30) - self.executor = RayExecutor( - settings, - num_hosts=self.num_hosts, - num_slots=self.num_slots, - use_gpu=self.use_gpu) - self.trainer.model = model - self.executor.start(executable_cls=get_executable_cls()) - - def train(self): - trainer = self.trainer - trainer_ref = ray.put(self.trainer) - self.trainer = None - results = self.executor.run(self.train_remote, args=[trainer_ref]) - results, state_dict, best_path = results[0] - - self.trainer = trainer - self.trainer.model.load_state_dict(state_dict) - if self.trainer.checkpoint_callback: - self.trainer.checkpoint_callback.best_model_path = best_path - - return results - - def train_remote(self, trainer_ref): - self.trainer = ray.get(trainer_ref) - hvd.init() - if self.trainer.on_gpu: - # Horovod assigns one local GPU per process. - self.trainer.root_gpu = hvd.local_rank() - - # TODO: Make changes in PTL to clean this up. - super(HorovodRayAccelerator, self).setup(self.trainer.model) - results = super(HorovodRayAccelerator, self).train() - if hvd.rank() != 0: - # Only want results from the first worker. 
- return None - - best_model_path = None - if self.trainer.checkpoint_callback is not None: - best_model_path = self.trainer.checkpoint_callback.best_model_path - - model = self.trainer.model - return results, model.state_dict(), best_model_path - - def teardown(self): - self.executor.shutdown() diff --git a/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py b/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py deleted file mode 100644 index 1d8bb9d5e71c..000000000000 --- a/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py +++ /dev/null @@ -1,191 +0,0 @@ -import os - -import torch -import pytest -import ray -from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule -from ray.util.sgd.tests.test_ptl import PTL_Module -from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier -from ray.util.lightning_accelerators import HorovodRayAccelerator -import pytorch_lightning as pl - -try: - import horovod # noqa: F401 - from horovod.common.util import nccl_built -except ImportError: - HOROVOD_AVAILABLE = False -else: - HOROVOD_AVAILABLE = True - - -def _nccl_available(): - if not HOROVOD_AVAILABLE: - return False - try: - return nccl_built() - except AttributeError: - return False - - -@pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) - yield address_info - ray.shutdown() - - -@pytest.fixture -def ray_start_2_gpus(): - address_info = ray.init(num_cpus=2, num_gpus=2) - yield address_info - ray.shutdown() - # This env var is set by Pytorch Lightning. - # Make sure to reset it after each test. - # TODO: Upstream to PTL to not set this env var if using Ray. 
- del os.environ["CUDA_VISIBLE_DEVICES"] - - -@pytest.fixture -def seed(): - pl.seed_everything(0) - - -def get_model(lr=1e-2, hidden_size=1, data_size=10, val_size=10, batch_size=2): - config = { - "lr": lr, - "hidden_size": hidden_size, - "data_size": data_size, - "val_size": val_size, - "batch_size": batch_size - } - return PTL_Module(config) - - -def get_trainer(dir, - num_slots=2, - use_gpu=False, - max_epochs=1, - limit_train_batches=10, - limit_val_batches=10, - progress_bar_refresh_rate=0): - accelerator = HorovodRayAccelerator(num_slots=num_slots, use_gpu=use_gpu) - trainer = pl.Trainer( - default_root_dir=dir, - gpus=1 if use_gpu else 0, - max_epochs=max_epochs, - limit_train_batches=limit_train_batches, - limit_val_batches=limit_val_batches, - progress_bar_refresh_rate=progress_bar_refresh_rate, - checkpoint_callback=True, - accelerator=accelerator) - return trainer - - -def train_test(trainer, model): - initial_values = torch.tensor( - [torch.sum(torch.abs(x)) for x in model.parameters()]) - result = trainer.fit(model) - post_train_values = torch.tensor( - [torch.sum(torch.abs(x)) for x in model.parameters()]) - assert result == 1, "trainer failed" - # Check that the model is actually changed post-training. 
- assert torch.norm(initial_values - post_train_values) > 0.1 - - -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_train(tmpdir, ray_start_2_cpus, seed, num_slots): - model = get_model() - - trainer = get_trainer(tmpdir, num_slots=num_slots) - train_test(trainer, model) - - -def load_test(trainer, model): - trainer.fit(model) - trained_model = PTL_Module.load_from_checkpoint( - trainer.checkpoint_callback.best_model_path, config=model.config) - assert trained_model is not None, "loading model failed" - - -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_load(tmpdir, ray_start_2_cpus, seed, num_slots): - model = get_model() - trainer = get_trainer(tmpdir, num_slots=num_slots) - load_test(trainer, model) - - -def predict_test(trainer, model, dm): - trainer.fit(model, dm) - test_loader = dm.test_dataloader() - acc = pl.metrics.Accuracy() - for batch in test_loader: - x, y = batch - with torch.no_grad(): - y_hat = model(x) - y_hat = y_hat.cpu() - acc.update(y_hat, y) - average_acc = acc.compute() - assert average_acc >= 0.5, f"This model is expected to get > {0.5} in " \ - f"test set (it got {average_acc})" - - -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_predict(tmpdir, ray_start_2_cpus, seed, num_slots): - config = { - "layer_1": 32, - "layer_2": 32, - "lr": 1e-2, - "batch_size": 32, - } - model = LightningMNISTClassifier(config, tmpdir) - dm = MNISTDataModule( - data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"]) - trainer = get_trainer( - tmpdir, limit_train_batches=10, max_epochs=1, num_slots=num_slots) - predict_test(trainer, model, dm) - - -@pytest.mark.skipif( - not _nccl_available(), reason="test requires Horovod with NCCL support") -@pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): - model = get_model() - trainer = get_trainer(tmpdir, num_slots=num_slots, 
use_gpu=True) - train_test(trainer, model) - - -@pytest.mark.skipif( - not _nccl_available(), reason="test requires Horovod with NCCL support") -@pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): - model = get_model() - trainer = get_trainer(tmpdir, num_slots=num_slots, use_gpu=True) - load_test(trainer, model) - - -@pytest.mark.skipif( - not _nccl_available(), reason="test requires Horovod with NCCL support") -@pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_predict_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): - config = { - "layer_1": 32, - "layer_2": 32, - "lr": 1e-2, - "batch_size": 32, - } - model = LightningMNISTClassifier(config, tmpdir) - dm = MNISTDataModule( - data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"]) - trainer = get_trainer( - tmpdir, - limit_train_batches=10, - max_epochs=1, - num_slots=num_slots, - use_gpu=True) - predict_test(trainer, model, dm) From 5c08eda9d29e51620f7ff94b1a97dd0ede4b0138 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 221/244] Revert "[Dependencies] Move requirements.txt to requirements directory. (#13636)" This reverts commit a0df936cc21a138b9654d34c453b1f20156b7a03. 
--- .github/dependabot.yml | 12 ------------ ci/travis/install-dependencies.sh | 2 +- python/{requirements => }/requirements.txt | 0 python/requirements/requirements_tune.in | 2 +- python/setup.py | 4 ++-- 5 files changed, 4 insertions(+), 16 deletions(-) rename python/{requirements => }/requirements.txt (100%) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 3074b6042bc9..9f8b6b7a730a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,15 +21,3 @@ updates: open-pull-requests-limit: 3 reviewers: - "ray-project/ray-tune" - ignore: - # Ignore pinned dependencies in requirements.txt. - - dependency-name: aiohttp - - dependency-name: msgpack - - dependency-name: opencv-python-headless - - dependency-name: pandas - - dependency-name: scipy - - dependency-name: pydantic - - dependency-name: cython - - dependency-name: llmvlite - - dependency-name: pytest - - dependency-name: scikit-learn diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index 96f4fa95a8f2..8c42f694ce57 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -274,7 +274,7 @@ install_dependencies() { local status="0"; local errmsg=""; for _ in {1..3}; do - errmsg=$(CC=gcc pip install -r "${WORKSPACE_DIR}"/python/requirements/requirements.txt 2>&1) && break; + errmsg=$(CC=gcc pip install -r "${WORKSPACE_DIR}"/python/requirements.txt 2>&1) && break; status=$errmsg && echo "'pip install ...' failed, will retry after n seconds!" 
&& sleep 30; done if [ "$status" != "0" ]; then diff --git a/python/requirements/requirements.txt b/python/requirements.txt similarity index 100% rename from python/requirements/requirements.txt rename to python/requirements.txt diff --git a/python/requirements/requirements_tune.in b/python/requirements/requirements_tune.in index 9bb83cbeec73..40ccf4be43d1 100644 --- a/python/requirements/requirements_tune.in +++ b/python/requirements/requirements_tune.in @@ -1,5 +1,5 @@ # Use base requirements to constrain these requirements. --c ./requirements.txt +-c ../requirements.txt ax-platform==0.1.9; python_version < '3.7' ax-platform==0.1.19; python_version >= '3.7' diff --git a/python/setup.py b/python/setup.py index a1542a7a292c..18d012b99e52 100644 --- a/python/setup.py +++ b/python/setup.py @@ -92,7 +92,7 @@ ] # If you're adding dependencies for ray extras, please -# also update the matching section of requirements/requirements.txt +# also update the matching section of requirements.txt # in this directory extras = { "serve": [ @@ -120,7 +120,7 @@ # These are the main dependencies for users of ray. This list # should be carefully curated. If you change it, please reflect -# the change in the matching section of requirements/requirements.txt +# the change in the matching section of requirements.txt install_requires = [ # TODO(alex) Pin the version once this PR is # included in the stable release. From 49584cfa631a671cd1c94e93b3514a05800010c9 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 222/244] Revert "[Metrics] Fix serialization for custom metrics (#13571)" This reverts commit 2f8fdce2c7807373d255178d194f8f4b2a995595. 
--- python/ray/tests/test_metrics_agent.py | 5 +---- python/ray/util/metrics.py | 16 ---------------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index 8e02c4ae360b..86670b8a32cc 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -37,8 +37,6 @@ def _setup_cluster_for_test(ray_start_cluster): def f(): counter = Count("test_counter", description="desc") counter.record(1) - counter = ray.get(ray.put(counter)) # Test serialization. - counter.record(1) ray.get(worker_should_exit.wait.remote()) @ray.remote @@ -46,7 +44,6 @@ class A: async def ping(self): histogram = Histogram( "test_histogram", description="desc", boundaries=[0.1, 1.6]) - histogram = ray.get(ray.put(histogram)) # Test serialization. histogram.record(1.5) ray.get(worker_should_exit.wait.remote()) @@ -103,7 +100,7 @@ def test_cases(): test_counter_sample = [ m for m in metric_samples if "test_counter" in m.name ][0] - assert test_counter_sample.value == 2.0 + assert test_counter_sample.value == 1.0 test_driver_counter_sample = [ m for m in metric_samples if "test_driver_counter" in m.name diff --git a/python/ray/util/metrics.py b/python/ray/util/metrics.py index 57a01cf7aa0b..d287a503fa73 100644 --- a/python/ray/util/metrics.py +++ b/python/ray/util/metrics.py @@ -147,11 +147,6 @@ def __init__(self, self._metric = CythonCount(self._name, self._description, self._unit, self._tag_keys) - def __reduce__(self): - deserializer = Count - serialized_data = (self._name, self._description, self._tag_keys) - return deserializer, serialized_data - class Histogram(Metric): """Histogram distribution of metric points. 
@@ -182,12 +177,6 @@ def __init__(self, self._unit, self.boundaries, self._tag_keys) - def __reduce__(self): - deserializer = Histogram - serialized_data = (self._name, self._description, self.boundaries, - self._tag_keys) - return deserializer, serialized_data - @property def info(self): """Return information about histogram metric.""" @@ -215,11 +204,6 @@ def __init__(self, self._metric = CythonGauge(self._name, self._description, self._unit, self._tag_keys) - def __reduce__(self): - deserializer = Gauge - serialized_data = (self._name, self._description, self._tag_keys) - return deserializer, serialized_data - __all__ = [ "Count", From 50dee4911b18efe85c5c40a1e0086821e603fa14 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 223/244] Revert "Revert "[dashboard] Fix RAY_RAYLET_PID KeyError on Windows (#12948)" (#13572)" This reverts commit f09325a5d34efa78924c396501b148cd96c5d4e5. --- dashboard/agent.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index f1c496b89004..7bf5e1551a2b 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -62,9 +62,13 @@ def __init__(self, self.object_store_name = object_store_name self.raylet_name = raylet_name self.node_id = os.environ["RAY_NODE_ID"] - self.ppid = int(os.environ["RAY_RAYLET_PID"]) - assert self.ppid > 0 - logger.info("Parent pid is %s", self.ppid) + # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is + # only used for fate-sharing with the raylet and we need a different + # fate-sharing mechanism for Windows anyways. 
+ if sys.platform not in ["win32", "cygwin"]: + self.ppid = int(os.environ["RAY_RAYLET_PID"]) + assert self.ppid > 0 + logger.info("Parent pid is %s", self.ppid) self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), )) self.grpc_port = self.server.add_insecure_port( f"[::]:{self.dashboard_agent_port}") @@ -108,7 +112,8 @@ async def _check_parent(): logger.error("Failed to check parent PID, exiting.") sys.exit(1) - check_parent_task = create_task(_check_parent()) + if sys.platform not in ["win32", "cygwin"]: + check_parent_task = create_task(_check_parent()) # Create an aioredis client for all modules. try: From 12bb1b5afaab932472be0251048a379591897c92 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 224/244] Revert "[kubernetes][operator][minutiae] Backwards compatibility of operator (#13623)" This reverts commit c51fd89704e5f9068ed216b82275a482d633f7d1. --- python/ray/operator/operator_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/operator/operator_utils.py b/python/ray/operator/operator_utils.py index 08afda94f1d4..94d2a00cf34e 100644 --- a/python/ray/operator/operator_utils.py +++ b/python/ray/operator/operator_utils.py @@ -95,4 +95,4 @@ def get_cluster_owner_reference( def translate(configuration: Dict[str, Any], dictionary: Dict[str, str]) -> Dict[str, Any]: - return {dictionary[field]: configuration[field] for field in dictionary} + return {dictionary[field]: configuration[field] for field in configuration} From c3dd55366e5128e1a33f944aab60761db54dc7e8 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 225/244] Revert "[RLlib] Add grad_clip config option to MARWIL and stabilize grad clipping against inf global_norms. (#13634)" This reverts commit cf043621129b99eccbed7127bb7f93e370e07646. 
--- rllib/agents/marwil/marwil.py | 2 -- rllib/agents/marwil/marwil_tf_policy.py | 4 +--- rllib/agents/marwil/marwil_torch_policy.py | 3 +-- rllib/agents/ppo/ppo_tf_policy.py | 10 ++-------- 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/rllib/agents/marwil/marwil.py b/rllib/agents/marwil/marwil.py index d123b3ef5f5f..c4f88fdb8b30 100644 --- a/rllib/agents/marwil/marwil.py +++ b/rllib/agents/marwil/marwil.py @@ -21,8 +21,6 @@ "beta": 1.0, # Balancing value estimation loss and policy optimization loss. "vf_coeff": 1.0, - # If specified, clip the global norm of gradients by this amount. - "grad_clip": None, # Whether to calculate cumulative rewards. "postprocess_inputs": True, # Whether to rollout "complete_episodes" or "truncate_episodes". diff --git a/rllib/agents/marwil/marwil_tf_policy.py b/rllib/agents/marwil/marwil_tf_policy.py index 211f9467e7b0..44352be4f883 100644 --- a/rllib/agents/marwil/marwil_tf_policy.py +++ b/rllib/agents/marwil/marwil_tf_policy.py @@ -1,7 +1,6 @@ import logging import ray -from ray.rllib.agents.ppo.ppo_tf_policy import compute_and_clip_gradients from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing @@ -134,7 +133,7 @@ def __init__(self, policy, value_estimates, action_dist, actions, # Exponentially weighted advantages. c = tf.math.sqrt(policy._moving_average_sqd_adv_norm) - exp_advs = tf.math.exp(beta * (adv / (1e-8 + c))) + exp_advs = tf.math.exp(beta * (adv / c)) # Static graph. 
else: update_adv_norm = tf1.assign_add( @@ -201,5 +200,4 @@ def setup_mixins(policy, obs_space, action_space, config): stats_fn=stats, postprocess_fn=postprocess_advantages, before_loss_init=setup_mixins, - gradients_fn=compute_and_clip_gradients, mixins=[ValueNetworkMixin]) diff --git a/rllib/agents/marwil/marwil_torch_policy.py b/rllib/agents/marwil/marwil_torch_policy.py index 14ae943ecaf5..ef3558378794 100644 --- a/rllib/agents/marwil/marwil_torch_policy.py +++ b/rllib/agents/marwil/marwil_torch_policy.py @@ -4,7 +4,7 @@ from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_ops import apply_grad_clipping, explained_variance +from ray.rllib.utils.torch_ops import explained_variance torch, _ = try_import_torch() @@ -98,6 +98,5 @@ def setup_mixins(policy, obs_space, action_space, config): get_default_config=lambda: ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG, stats_fn=stats, postprocess_fn=postprocess_advantages, - extra_grad_process_fn=apply_grad_clipping, before_loss_init=setup_mixins, mixins=[ValueNetworkMixin]) diff --git a/rllib/agents/ppo/ppo_tf_policy.py b/rllib/agents/ppo/ppo_tf_policy.py index 5991da84e328..57874ba296b3 100644 --- a/rllib/agents/ppo/ppo_tf_policy.py +++ b/rllib/agents/ppo/ppo_tf_policy.py @@ -182,15 +182,9 @@ def compute_and_clip_gradients(policy: Policy, optimizer: LocalOptimizer, # Clip by global norm, if necessary. if policy.config["grad_clip"] is not None: - # Defuse inf gradients (due to super large losses). grads = [g for (g, v) in grads_and_vars] - grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) - # If the global_norm is inf -> All grads will be NaN. Stabilize this - # here by setting them to 0.0. This will simply ignore destructive loss - # calculations. 
- policy.grads = [ - tf.where(tf.math.is_nan(g), tf.zeros_like(g), g) for g in grads - ] + policy.grads, _ = tf.clip_by_global_norm(grads, + policy.config["grad_clip"]) clipped_grads_and_vars = list(zip(policy.grads, variables)) return clipped_grads_and_vars else: From 398a77eef22fa33bd7119217b41f1640cc7b92d4 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 226/244] Revert "[Metrics] Cache metrics ports in a file at each node (#13501)" This reverts commit 4d32c72a6b22d364b47ac61747b09d811bb23cf4. --- python/ray/node.py | 64 ++------------ python/ray/tests/test_metrics_agent.py | 114 +++++++++++-------------- 2 files changed, 60 insertions(+), 118 deletions(-) diff --git a/python/ray/node.py b/python/ray/node.py index 186ae3dfdbfd..425965021240 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -13,9 +13,6 @@ import tempfile import time -from typing import Optional, Dict -from collections import defaultdict - import ray import ray.ray_constants as ray_constants import ray._private.services @@ -124,10 +121,18 @@ def __init__(self, self._raylet_ip_address = raylet_ip_address + self.metrics_agent_port = (ray_params.metrics_agent_port + or self._get_unused_port()[0]) + self._metrics_export_port = ray_params.metrics_export_port + if self._metrics_export_port is None: + self._metrics_export_port = self._get_unused_port()[0] + ray_params.update_if_absent( include_log_monitor=True, resources={}, temp_dir=ray.utils.get_ray_temp_dir(), + metrics_agent_port=self.metrics_agent_port, + metrics_export_port=self._metrics_export_port, worker_path=os.path.join( os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")) @@ -185,15 +190,6 @@ def __init__(self, self._raylet_socket_name = self._prepare_socket_file( self._ray_params.raylet_socket_name, default_prefix="raylet") - self.metrics_agent_port = self._get_cached_port( - "metrics_agent_port", 
default_port=ray_params.metrics_agent_port) - self._metrics_export_port = self._get_cached_port( - "metrics_export_port", default_port=ray_params.metrics_export_port) - - ray_params.update_if_absent( - metrics_agent_port=self.metrics_agent_port, - metrics_export_port=self._metrics_export_port) - if head: ray_params.update_if_absent(num_redis_shards=1) self._webui_url = None @@ -559,50 +555,6 @@ def _prepare_socket_file(self, socket_path, default_prefix): "{} bytes: {!r}".format(maxlen, result)) return result - def _get_cached_port(self, - port_name: str, - default_port: Optional[int] = None) -> int: - """Get a port number from a cache on this node. - - Different driver processes on a node should use the same ports for - some purposes, e.g. exporting metrics. This method returns a port - number for the given port name and caches it in a file. If the - port isn't already cached, an unused port is generated and cached. - - Args: - port_name (str): the name of the port, e.g. metrics_export_port - default_port (Optional[int]): The port to return and cache if no - port has already been cached for the given port_name. If None, an - unused port is generated and cached. - Returns: - port (int): the port number. - """ - file_path = os.path.join(self.get_session_dir_path(), - "ports_by_node.json") - - # Maps a Node.unique_id to a dict that maps port names to port numbers. - ports_by_node: Dict[str, Dict[str, int]] = defaultdict(dict) - - if not os.path.exists(file_path): - with open(file_path, "w") as f: - json.dump({}, f) - - with open(file_path, "r") as f: - ports_by_node.update(json.load(f)) - - if (self.unique_id in ports_by_node - and port_name in ports_by_node[self.unique_id]): - # The port has already been cached at this node, so use it. - port = int(ports_by_node[self.unique_id][port_name]) - else: - # Pick a new port to use and cache it at this node. 
- port = (default_port or self._get_unused_port()[0]) - ports_by_node[self.unique_id][port_name] = port - with open(file_path, "w") as f: - json.dump(ports_by_node, f) - - return port - def start_reaper_process(self): """ Start the reaper process. diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index 86670b8a32cc..b52f472efc26 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -15,6 +15,54 @@ from ray.test_utils import wait_for_condition, SignalActor, fetch_prometheus +def test_prometheus_file_based_service_discovery(ray_start_cluster): + # Make sure Prometheus service discovery file is correctly written + # when number of nodes are dynamically changed. + NUM_NODES = 5 + cluster = ray_start_cluster + nodes = [cluster.add_node() for _ in range(NUM_NODES)] + cluster.wait_for_nodes() + addr = ray.init(address=cluster.address) + redis_address = addr["redis_address"] + writer = PrometheusServiceDiscoveryWriter( + redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray") + + def get_metrics_export_address_from_node(nodes): + return [ + "{}:{}".format(node.node_ip_address, node.metrics_export_port) + for node in nodes + ] + + loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + assert (set(get_metrics_export_address_from_node(nodes)) == set( + loaded_json_data["targets"])) + + # Let's update nodes. + for _ in range(3): + nodes.append(cluster.add_node()) + + # Make sure service discovery file content is correctly updated. 
+ loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + assert (set(get_metrics_export_address_from_node(nodes)) == set( + loaded_json_data["targets"])) + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Failing on Windows.") +def test_prome_file_discovery_run_by_dashboard(shutdown_only): + ray.init(num_cpus=0) + global_node = ray.worker._global_node + temp_dir = global_node.get_temp_dir_path() + + def is_service_discovery_exist(): + for path in pathlib.Path(temp_dir).iterdir(): + if PROMETHEUS_SERVICE_DISCOVERY_FILE in str(path): + return True + return False + + wait_for_condition(is_service_discovery_exist) + + @pytest.fixture def _setup_cluster_for_test(ray_start_cluster): NUM_NODES = 2 @@ -28,10 +76,6 @@ def _setup_cluster_for_test(ray_start_cluster): worker_should_exit = SignalActor.remote() - # Generate a metric in the driver. - counter = Count("test_driver_counter", description="desc") - counter.record(1) - # Generate some metrics from actor & tasks. @ray.remote def f(): @@ -88,25 +132,19 @@ def test_cases(): for components in components_dict.values()) # Make sure our user defined metrics exist - for metric_name in [ - "test_counter", "test_histogram", "test_driver_counter" - ]: + for metric_name in ["test_counter", "test_histogram"]: assert any(metric_name in full_name for full_name in metric_names) # Make sure GCS server metrics are recorded. 
assert "ray_outbound_heartbeat_size_kb_sum" in metric_names - # Make sure the numeric values are correct + # Make sure the numeric value is correct test_counter_sample = [ m for m in metric_samples if "test_counter" in m.name ][0] assert test_counter_sample.value == 1.0 - test_driver_counter_sample = [ - m for m in metric_samples if "test_driver_counter" in m.name - ][0] - assert test_driver_counter_sample.value == 1.0 - + # Make sure the numeric value is correct test_histogram_samples = [ m for m in metric_samples if "test_histogram" in m.name ] @@ -140,58 +178,10 @@ def wrap_test_case_for_retry(): ) except RuntimeError: print( - f"The components are {pformat(fetch_prometheus(prom_addresses))}") + f"The compoenents are {pformat(fetch_prometheus(prom_addresses))}") test_cases() # Should fail assert -def test_prometheus_file_based_service_discovery(ray_start_cluster): - # Make sure Prometheus service discovery file is correctly written - # when number of nodes are dynamically changed. - NUM_NODES = 5 - cluster = ray_start_cluster - nodes = [cluster.add_node() for _ in range(NUM_NODES)] - cluster.wait_for_nodes() - addr = ray.init(address=cluster.address) - redis_address = addr["redis_address"] - writer = PrometheusServiceDiscoveryWriter( - redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray") - - def get_metrics_export_address_from_node(nodes): - return [ - "{}:{}".format(node.node_ip_address, node.metrics_export_port) - for node in nodes - ] - - loaded_json_data = json.loads(writer.get_file_discovery_content())[0] - assert (set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"])) - - # Let's update nodes. - for _ in range(3): - nodes.append(cluster.add_node()) - - # Make sure service discovery file content is correctly updated. 
- loaded_json_data = json.loads(writer.get_file_discovery_content())[0] - assert (set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"])) - - -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -def test_prome_file_discovery_run_by_dashboard(shutdown_only): - ray.init(num_cpus=0) - global_node = ray.worker._global_node - temp_dir = global_node.get_temp_dir_path() - - def is_service_discovery_exist(): - for path in pathlib.Path(temp_dir).iterdir(): - if PROMETHEUS_SERVICE_DISCOVERY_FILE in str(path): - return True - return False - - wait_for_condition(is_service_discovery_exist) - - @pytest.fixture def metric_mock(): mock = MagicMock() From 1e7a71fc0fe26c8b2b1039b58d8db59f3a2d4d81 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 227/244] Revert "[Java] Add `fetchLocal` parameter in `Ray.wait()` (#13604)" This reverts commit 108d560670e300fb4cced7e0252338caa9850b82. 
--- java/api/src/main/java/io/ray/api/Ray.java | 31 ++++------------ .../java/io/ray/api/runtime/RayRuntime.java | 11 ++---- .../io/ray/runtime/AbstractRayRuntime.java | 5 ++- .../runtime/object/LocalModeObjectStore.java | 3 +- .../ray/runtime/object/NativeObjectStore.java | 7 ++-- .../io/ray/runtime/object/ObjectStore.java | 23 ++++-------- java/test.sh | 9 ----- java/test/pom.xml | 35 +++++++++++++++++++ .../main/java/io/ray/test/PlasmaFreeTest.java | 2 +- .../io/ray/test/ReferenceCountingTest.java | 2 +- .../java/io_ray_runtime_RayNativeRuntime.h | 2 +- .../io_ray_runtime_object_NativeObjectStore.h | 6 ++-- .../io_ray_runtime_task_NativeTaskExecutor.h | 19 ++++++++++ .../io_ray_runtime_task_NativeTaskSubmitter.h | 10 +++--- 14 files changed, 87 insertions(+), 78 deletions(-) diff --git a/java/api/src/main/java/io/ray/api/Ray.java b/java/api/src/main/java/io/ray/api/Ray.java index fb71a3bacbdf..da9047a66075 100644 --- a/java/api/src/main/java/io/ray/api/Ray.java +++ b/java/api/src/main/java/io/ray/api/Ray.java @@ -87,24 +87,6 @@ public static List get(List> objectList) { return internal().get(objectList); } - /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. - * - * @param waitList A list of object references to wait for. - * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. - * @return Two lists, one containing locally available objects, one containing the rest. 
- */ - public static WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { - return internal().wait(waitList, numReturns, timeoutMs, fetchLocal); - } - /** * Wait for a list of RayObjects to be locally available, until specified number of objects are * ready, or specified timeout has passed. @@ -115,29 +97,30 @@ public static WaitResult wait( * @return Two lists, one containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns, int timeoutMs) { - return wait(waitList, numReturns, timeoutMs, true); + return internal().wait(waitList, numReturns, timeoutMs); } /** - * Wait for a list of RayObjects to be locally available, until specified number of objects are - * ready. + * A convenient helper method for Ray.wait. It will wait infinitely until specified number of + * objects are locally available. * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. * @return Two lists, one containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns) { - return wait(waitList, numReturns, Integer.MAX_VALUE); + return internal().wait(waitList, numReturns, Integer.MAX_VALUE); } /** - * Wait for a list of RayObjects to be locally available. + * A convenient helper method for Ray.wait. It will wait infinitely until all objects are locally + * available. * * @param waitList A list of object references to wait for. * @return Two lists, one containing locally available objects, one containing the rest. 
*/ public static WaitResult wait(List> waitList) { - return wait(waitList, waitList.size()); + return internal().wait(waitList, waitList.size(), Integer.MAX_VALUE); } /** diff --git a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java index ac5f44f3f139..53da3d48dae8 100644 --- a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java +++ b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java @@ -53,20 +53,15 @@ public interface RayRuntime { List get(List> objectRefs); /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. + * Wait for a list of RayObjects to be locally available, until specified number of objects are + * ready, or specified timeout has passed. * * @param waitList A list of ObjectRef to wait for. * @param numReturns The number of objects that should be returned. * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. * @return Two lists, one containing locally available objects, one containing the rest. */ - WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal); + WaitResult wait(List> waitList, int numReturns, int timeoutMs); /** * Free a list of objects from Plasma Store. 
diff --git a/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java b/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java index 15d9e9d76a53..f3478e4c6c68 100644 --- a/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java +++ b/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java @@ -105,9 +105,8 @@ public void free(List> objectRefs, boolean localOnly) { } @Override - public WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { - return objectStore.wait(waitList, numReturns, timeoutMs, fetchLocal); + public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { + return objectStore.wait(waitList, numReturns, timeoutMs); } @Override diff --git a/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java index cb5752d00a81..e1bfc64faa62 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java @@ -60,8 +60,7 @@ public List getRaw(List objectIds, long timeoutMs) { } @Override - public List wait( - List objectIds, int numObjects, long timeoutMs, boolean fetchLocal) { + public List wait(List objectIds, int numObjects, long timeoutMs) { waitInternal(objectIds, numObjects, timeoutMs); return objectIds.stream().map(pool::containsKey).collect(Collectors.toList()); } diff --git a/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java index c68709e10e68..24dd5b8a2699 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java @@ -45,9 +45,8 @@ public List getRaw(List objectIds, long timeoutMs) { } @Override - public List wait( - List objectIds, int numObjects, long timeoutMs, boolean 
fetchLocal) { - return nativeWait(toBinaryList(objectIds), numObjects, timeoutMs, fetchLocal); + public List wait(List objectIds, int numObjects, long timeoutMs) { + return nativeWait(toBinaryList(objectIds), numObjects, timeoutMs); } @Override @@ -114,7 +113,7 @@ private static List toBinaryList(List ids) { private static native List nativeGet(List ids, long timeoutMs); private static native List nativeWait( - List objectIds, int numObjects, long timeoutMs, boolean fetchLocal); + List objectIds, int numObjects, long timeoutMs); private static native void nativeDelete(List objectIds, boolean localOnly); diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java index 5e7b626033a2..8711811b24ad 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java @@ -117,36 +117,25 @@ public List get(List ids, Class elementType) { } /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. + * Wait for a list of objects to appear in the object store. * * @param objectIds IDs of the objects to wait for. * @param numObjects Number of objects that should appear. * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. * @return A bitset that indicates each object has appeared or not. 
*/ - public abstract List wait( - List objectIds, int numObjects, long timeoutMs, boolean fetchLocal); + public abstract List wait(List objectIds, int numObjects, long timeoutMs); /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. + * Wait for a list of RayObjects to be locally available, until specified number of objects are + * ready, or specified timeout has passed. * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. * @return Two lists, one containing locally available objects, one containing the rest. */ - public WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { + public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { Preconditions.checkNotNull(waitList); if (waitList.isEmpty()) { return new WaitResult<>(Collections.emptyList(), Collections.emptyList()); @@ -155,7 +144,7 @@ public WaitResult wait( List ids = waitList.stream().map(ref -> ((ObjectRefImpl) ref).getId()).collect(Collectors.toList()); - List ready = wait(ids, numReturns, timeoutMs, fetchLocal); + List ready = wait(ids, numReturns, timeoutMs); List> readyList = new ArrayList<>(); List> unreadyList = new ArrayList<>(); diff --git a/java/test.sh b/java/test.sh index f946fd91ad6f..8336c1da1c5f 100755 --- a/java/test.sh +++ b/java/test.sh @@ -41,15 +41,6 @@ bazel build //java:gen_maven_deps echo "Build test jar." bazel build //java:all_tests_deploy.jar -java/generate_jni_header_files.sh - -if ! 
git diff --exit-code -- java src/ray/core_worker/lib/java; then - echo "Files are changed after build. Common cases are:" - echo " * Java native methods doesn't match JNI files. You need to either update Java code or JNI code." - echo " * pom_template.xml and pom.xml doesn't match. You need to either update pom_template.xml or pom.xml." - exit 1 -fi - # Enable multi-worker feature in Java test TEST_ARGS=(-Dray.job.num-java-workers-per-process=10) diff --git a/java/test/pom.xml b/java/test/pom.xml index f401f3cff5ab..c9e34821b544 100644 --- a/java/test/pom.xml +++ b/java/test/pom.xml @@ -117,6 +117,41 @@ + + + com.diffplug.spotless + spotless-maven-plugin + 2.6.1 + + + + + + + + + .java + + + + + + + + true + 4 + + + + + + + 1.7 + + + + + diff --git a/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java b/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java index b8235b8d84fa..3e49ff798630 100644 --- a/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java +++ b/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java @@ -25,7 +25,7 @@ public void testDeleteObjects() { () -> !TestUtils.getRuntime() .getObjectStore() - .wait(ImmutableList.of(((ObjectRefImpl) helloId).getId()), 1, 0, true) + .wait(ImmutableList.of(((ObjectRefImpl) helloId).getId()), 1, 0) .get(0), 50); if (TestUtils.isSingleProcessMode()) { diff --git a/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java b/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java index a98f9595914b..aa56581951e6 100644 --- a/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java +++ b/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java @@ -119,7 +119,7 @@ private static void fillObjectStoreAndGet( TestUtils.getRuntime().getObjectStore().getRaw(ImmutableList.of(objectId), Long.MAX_VALUE); } else { List result = - TestUtils.getRuntime().getObjectStore().wait(ImmutableList.of(objectId), 1, 100, true); + TestUtils.getRuntime().getObjectStore().wait(ImmutableList.of(objectId), 1, 
100); Assert.assertFalse(result.get(0)); } } diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h index daa4e05a9300..69c05cf9315f 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h @@ -25,7 +25,7 @@ extern "C" { * Class: io_ray_runtime_RayNativeRuntime * Method: nativeInitialize * Signature: - * (ILjava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/String;[BLio/ray/runtime/gcs/GcsClientOptions;ILjava/lang/String;Ljava/util/Map;[B)V + * (ILjava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/String;[BLio/ray/runtime/gcs/GcsClientOptions;ILjava/lang/String;Ljava/util/Map;)V */ JNIEXPORT void JNICALL Java_io_ray_runtime_RayNativeRuntime_nativeInitialize( JNIEnv *, jclass, jint, jstring, jint, jstring, jstring, jstring, jbyteArray, jobject, diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h index fd194de55701..b1da06e57068 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h @@ -52,7 +52,7 @@ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeGet /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeWait - * Signature: (Ljava/util/List;IJZ)Ljava/util/List; + * Signature: (Ljava/util/List;IJ)Ljava/util/List; */ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeWait( JNIEnv *, jclass, jobject, jint, jlong, jboolean); @@ -68,7 +68,7 @@ JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeDelete /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeAddLocalReference - * Signature: ([B[B)V + * Signature: ([B)V */ JNIEXPORT void JNICALL 
Java_io_ray_runtime_object_NativeObjectStore_nativeAddLocalReference(JNIEnv *, jclass, @@ -78,7 +78,7 @@ Java_io_ray_runtime_object_NativeObjectStore_nativeAddLocalReference(JNIEnv *, j /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeRemoveLocalReference - * Signature: ([B[B)V + * Signature: ([B)V */ JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeRemoveLocalReference(JNIEnv *, jclass, diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h index ab7ec077d453..bf376aa12e64 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h @@ -21,6 +21,25 @@ #ifdef __cplusplus extern "C" { #endif +#undef io_ray_runtime_task_NativeTaskExecutor_NUM_ACTOR_CHECKPOINTS_TO_KEEP +#define io_ray_runtime_task_NativeTaskExecutor_NUM_ACTOR_CHECKPOINTS_TO_KEEP 20L +/* + * Class: io_ray_runtime_task_NativeTaskExecutor + * Method: nativePrepareCheckpoint + * Signature: ()[B + */ +JNIEXPORT jbyteArray JNICALL +Java_io_ray_runtime_task_NativeTaskExecutor_nativePrepareCheckpoint(JNIEnv *, jclass); + +/* + * Class: io_ray_runtime_task_NativeTaskExecutor + * Method: nativeNotifyActorResumedFromCheckpoint + * Signature: ([B)V + */ +JNIEXPORT void JNICALL +Java_io_ray_runtime_task_NativeTaskExecutor_nativeNotifyActorResumedFromCheckpoint( + JNIEnv *, jclass, jbyteArray); + #ifdef __cplusplus } #endif diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h index d57e2d573188..8ea517b60cf9 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h @@ -74,13 +74,13 @@ Java_io_ray_runtime_task_NativeTaskSubmitter_nativeRemovePlacementGroup(JNIEnv * /* * 
Class: io_ray_runtime_task_NativeTaskSubmitter * Method: nativeWaitPlacementGroupReady - * Signature: ([BI)Z + * Signature: (J)Z */ JNIEXPORT jboolean JNICALL -Java_io_ray_runtime_task_NativeTaskSubmitter_nativeWaitPlacementGroupReady(JNIEnv *, - jclass, - jbyteArray, - jint); +Java_io_ray_runtime_task_NativeTaskSubmitter__nativeWaitPlacementGroupReady(JNIEnv *, + jclass, + jbyteArray, + jint); #ifdef __cplusplus } From 450cf1b523a3f9b3d29c65aa20070ef4844116b1 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 228/244] Revert "[Object Spilling] Skip flaky tests (#13628)" This reverts commit 4d3c8d1145b060603376e94827cf1139908d8d2a. --- python/ray/tests/test_object_spilling.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 8319dbfcac54..745eb3bafc1d 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -343,9 +343,7 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], - reason="Failing on " - "Windows and Mac.") + platform.system() == "Windows", reason="Failing on Windows.") def test_delete_objects_delete_while_creating(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. temp_folder = tmp_path / "spill" @@ -395,9 +393,7 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], - reason="Failing on Windows " - "and Mac.") + platform.system() == "Windows", reason="Failing on Windows.") def test_delete_objects_on_worker_failure(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
temp_folder = tmp_path / "spill" @@ -545,7 +541,6 @@ def is_dir_empty(): wait_for_condition(is_dir_empty) -@pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") def test_fusion_objects(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. temp_folder = tmp_path / "spill" From ba1d3dc484ffcaa53d209345b1aa5899291657fe Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 229/244] Revert "Revert "[Serve] Refactor BackendState" (#13626)" This reverts commit 32d8be69ff7454bb2f3b86a21db7dde3d74786d6. --- python/ray/serve/backend_state.py | 533 +++++++++++++++++------------ python/ray/serve/config.py | 4 +- python/ray/serve/controller.py | 4 +- python/ray/serve/tests/test_api.py | 3 + 4 files changed, 327 insertions(+), 217 deletions(-) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index 673c4b2cfbc8..4aad2671ea4e 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -1,7 +1,8 @@ import asyncio -from asyncio.futures import Future from collections import defaultdict -from typing import Dict, Any, List, Optional, Set, Tuple +from enum import Enum +import time +from typing import Dict, List, Optional, Tuple import ray import ray.cloudpickle as pickle @@ -17,7 +18,6 @@ ) from ray.serve.config import BackendConfig, ReplicaConfig from ray.serve.constants import LongPollKey -from ray.serve.exceptions import RayServeException from ray.serve.kv_store import RayInternalKVStore from ray.serve.long_poll import LongPollHost from ray.serve.utils import (format_actor_name, get_random_letters, logger, @@ -30,6 +30,150 @@ _RESOURCE_CHECK_ENABLED = True +class ReplicaState(Enum): + SHOULD_START = 1 + STARTING = 2 + RUNNING = 3 + SHOULD_STOP = 4 + STOPPING = 5 + STOPPED = 6 + + +class BackendReplica: + def __init__(self, controller_name: str, detached: bool, + replica_tag: ReplicaTag, 
backend_tag: BackendTag): + self._actor_name = format_actor_name(replica_tag, controller_name) + self._controller_name = controller_name + self._detached = detached + self._replica_tag = replica_tag + self._backend_tag = backend_tag + self._actor_handle = None + self._startup_obj_ref = None + self._drain_obj_ref = None + self._state = ReplicaState.SHOULD_START + + def __get_state__(self): + clean_dict = self.__dict__.copy() + del clean_dict["_actor_handle"] + del clean_dict["_startup_obj_ref"] + del clean_dict["_drain_obj_ref"] + return clean_dict + + def __set_state__(self, d): + self.__dict__ = d + self._actor_handle = None + self._startup_obj_ref = None + self._drain_obj_ref = None + self._recover_from_checkpoint() + + def _recover_from_checkpoint(self): + if self._state == ReplicaState.STARTING: + # We do not need to pass in the class here because the actor + # creation has already been started if this class was checkpointed + # in the STARTING state. + self.start() + elif self._state == ReplicaState.RUNNING: + # Fetch actor handles for all backend replicas in the system. + # The actors must exist if this class was checkpointed in the + # RUNNING state. 
+ self._actor_handle = ray.get_actor(self._actor_name) + elif self._state == ReplicaState.STOPPING: + self.stop() + + def start(self, backend_info: Optional[BackendInfo]): + assert self._state in { + ReplicaState.SHOULD_START, ReplicaState.STARTING + }, (f"State must be {ReplicaState.SHOULD_START} or " + f"{ReplicaState.STARTING}, *not* {self._state}") + try: + self._actor_handle = ray.get_actor(self._actor_name) + except ValueError: + logger.debug("Starting replica '{}' for backend '{}'.".format( + self._replica_tag, self._backend_tag)) + self._actor_handle = ray.remote(backend_info.worker_class).options( + name=self._actor_name, + lifetime="detached" if self._detached else None, + max_restarts=-1, + max_task_retries=-1, + **backend_info.replica_config.ray_actor_options).remote( + self._backend_tag, self._replica_tag, + backend_info.replica_config.actor_init_args, + backend_info.backend_config, self._controller_name) + self._startup_obj_ref = self._actor_handle.ready.remote() + self._state = ReplicaState.STARTING + + def check_started(self): + if self._state == ReplicaState.RUNNING: + return True + assert self._state == ReplicaState.STARTING, ( + f"State must be {ReplicaState.STARTING}, *not* {self._state}") + ready, _ = ray.wait([self._startup_obj_ref], timeout=0) + if len(ready) == 1: + self._state = ReplicaState.RUNNING + return True + return False + + def set_should_stop(self, graceful_shutdown_timeout_s: Duration): + self._state = ReplicaState.SHOULD_STOP + self._graceful_shutdown_timeout_s = graceful_shutdown_timeout_s + + def stop(self): + # We need to handle transitions from: + # SHOULD_START -> SHOULD_STOP -> STOPPING + # This means that the replica_handle may not have been created. 
+ + assert self._state in { + ReplicaState.SHOULD_STOP, ReplicaState.STOPPING + }, (f"State must be {ReplicaState.SHOULD_STOP} or " + f"{ReplicaState.STOPPING}, *not* {self._state}") + + def drain_actor(actor_name): + # NOTE: the replicas may already be stopped if we failed + # after stopping them but before writing a checkpoint. + try: + replica = ray.get_actor(actor_name) + except ValueError: + return None + return replica.drain_pending_queries.remote() + + self._state = ReplicaState.STOPPING + self._drain_obj_ref = drain_actor(self._actor_name) + self._shutdown_deadline = time.time( + ) + self._graceful_shutdown_timeout_s + + def check_stopped(self): + if self._state == ReplicaState.STOPPED: + return True + assert self._state == ReplicaState.STOPPING, ( + f"State must be {ReplicaState.STOPPING}, *not* {self._state}") + + try: + replica = ray.get_actor(self._actor_name) + except ValueError: + self._state = ReplicaState.STOPPED + return True + + ready, _ = ray.wait([self._drain_obj_ref], timeout=0) + timeout_passed = time.time() > self._shutdown_deadline + + if len(ready) == 1 or timeout_passed: + if timeout_passed: + # Graceful period passed, kill it forcefully. + logger.debug( + f"{self._actor_name} did not shutdown after " + f"{self._graceful_shutdown_timeout_s}s, force-killing.") + + ray.kill(replica, no_restart=True) + self._state = ReplicaState.STOPPED + return True + return False + + def get_actor_handle(self): + assert self._state == ReplicaState.RUNNING, ( + f"State must be {ReplicaState.RUNNING}, *not* {self._state}") + return self._actor_handle + + class BackendState: """Manages all state for backends in the system. @@ -46,79 +190,65 @@ def __init__(self, controller_name: str, detached: bool, self._long_poll_host = long_poll_host self._goal_manager = goal_manager - # Non-checkpointed state. 
- self.currently_starting_replicas: Dict[asyncio.Future, Tuple[ - BackendTag, ReplicaTag, ActorHandle]] = dict() - self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[ - BackendTag, ReplicaTag]] = dict() - - # Checkpointed state. - self.backends: Dict[BackendTag, BackendInfo] = dict() - self.backend_replicas: Dict[BackendTag, Dict[ - ReplicaTag, ActorHandle]] = defaultdict(dict) + self._replicas: Dict[BackendTag, Dict[ReplicaState, List[ + BackendReplica]]] = defaultdict(lambda: defaultdict(list)) + self._backend_metadata: Dict[BackendTag, BackendInfo] = dict() + self._target_replicas: Dict[BackendTag, int] = defaultdict(int) self.backend_goals: Dict[BackendTag, GoalId] = dict() - self.backend_replicas_to_start: Dict[BackendTag, List[ - ReplicaTag]] = defaultdict(list) - self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[ - ReplicaTag, Duration]]] = defaultdict(list) - self.backends_to_remove: List[BackendTag] = list() + + # Un-Checkpointed state. + self.pending_goals: Dict[GoalId, asyncio.Event] = dict() checkpoint = self._kv_store.get(CHECKPOINT_KEY) if checkpoint is not None: - (self.backends, self.backend_replicas, self.backend_goals, - self.backend_replicas_to_start, self.backend_replicas_to_stop, - self.backend_to_remove, - pending_goal_ids) = pickle.loads(checkpoint) + (self._replicas, self._backend_metadata, self._target_replicas, + self.backend_goals, pending_goal_ids) = pickle.loads(checkpoint) for goal_id in pending_goal_ids: self._goal_manager.create_goal(goal_id) - # Fetch actor handles for all backend replicas in the system. - # All of these backend_replicas are guaranteed to already exist - # because they would not be written to a checkpoint in - # self.backend_replicas until they were created. 
- for backend_tag, replica_dict in self.backend_replicas.items(): - for replica_tag in replica_dict.keys(): - replica_name = format_actor_name(replica_tag, - self._controller_name) - self.backend_replicas[backend_tag][ - replica_tag] = ray.get_actor(replica_name) - self._notify_backend_configs_changed() self._notify_replica_handles_changed() def _checkpoint(self) -> None: self._kv_store.put( CHECKPOINT_KEY, - pickle.dumps( - (self.backends, self.backend_replicas, self.backend_goals, - self.backend_replicas_to_start, self.backend_replicas_to_stop, - self.backends_to_remove, - self._goal_manager.get_pending_goal_ids()))) + pickle.dumps((self._replicas, self._backend_metadata, + self._target_replicas, self.backend_goals, + self._goal_manager.get_pending_goal_ids()))) def _notify_backend_configs_changed(self) -> None: self._long_poll_host.notify_changed(LongPollKey.BACKEND_CONFIGS, self.get_backend_configs()) + def get_running_replica_handles( + self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: + return { + backend_tag: { + backend_replica._replica_tag: + backend_replica.get_actor_handle() + for backend_replica in state_to_replica_dict[ + ReplicaState.RUNNING] + } + for backend_tag, state_to_replica_dict in self._replicas.items() + } + def _notify_replica_handles_changed(self) -> None: self._long_poll_host.notify_changed( LongPollKey.REPLICA_HANDLES, { backend_tag: list(replica_dict.values()) - for backend_tag, replica_dict in self.backend_replicas.items() + for backend_tag, replica_dict in + self.get_running_replica_handles().items() }) def get_backend_configs(self) -> Dict[BackendTag, BackendConfig]: return { tag: info.backend_config - for tag, info in self.backends.items() + for tag, info in self._backend_metadata.items() } - def get_replica_handles( - self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: - return self.backend_replicas - def get_backend(self, backend_tag: BackendTag) -> Optional[BackendInfo]: - return self.backends.get(backend_tag) + 
return self._backend_metadata.get(backend_tag) def _set_backend_goal(self, backend_tag: BackendTag, backend_info: BackendInfo) -> None: @@ -126,7 +256,11 @@ def _set_backend_goal(self, backend_tag: BackendTag, new_goal_id = self._goal_manager.create_goal() if backend_info is not None: - self.backends[backend_tag] = backend_info + self._backend_metadata[backend_tag] = backend_info + self._target_replicas[ + backend_tag] = backend_info.backend_config.num_replicas + else: + self._target_replicas[backend_tag] = 0 self.backend_goals[backend_tag] = new_goal_id @@ -136,31 +270,25 @@ def create_backend(self, backend_tag: BackendTag, backend_config: BackendConfig, replica_config: ReplicaConfig) -> Optional[GoalId]: # Ensures this method is idempotent. - backend_info = self.backends.get(backend_tag) + backend_info = self._backend_metadata.get(backend_tag) if backend_info is not None: if (backend_info.backend_config == backend_config and backend_info.replica_config == replica_config): return None - backend_replica = create_backend_replica(replica_config.func_or_class) + backend_replica_class = create_backend_replica( + replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. backend_info = BackendInfo( - worker_class=backend_replica, + worker_class=backend_replica_class, backend_config=backend_config, replica_config=replica_config) new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, backend_info) - try: - self.scale_backend_replicas(backend_tag, - backend_config.num_replicas) - except RayServeException as e: - del self.backends[backend_tag] - raise e - # NOTE(edoakes): we must write a checkpoint before starting new # or pushing the updated config to avoid inconsistent state if we # crash while making the change. @@ -175,20 +303,15 @@ def delete_backend(self, backend_tag: BackendTag, force_kill: bool = False) -> Optional[GoalId]: # This method must be idempotent. 
We should validate that the # specified backend exists on the client. - if backend_tag not in self.backends: + if backend_tag not in self._backend_metadata: return None - # Scale its replicas down to 0. - self.scale_backend_replicas(backend_tag, 0, force_kill) - - # Remove the backend's metadata. - del self.backends[backend_tag] - - # Add the intention to remove the backend from the routers. - self.backends_to_remove.append(backend_tag) - new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, None) + if force_kill: + self._backend_metadata[ + backend_tag].backend_config.\ + experimental_graceful_shutdown_timeout_s = 0 self._checkpoint() if existing_goal_id is not None: @@ -197,20 +320,18 @@ def delete_backend(self, backend_tag: BackendTag, def update_backend_config(self, backend_tag: BackendTag, config_options: BackendConfig): - if backend_tag not in self.backends: + if backend_tag not in self._backend_metadata: raise ValueError(f"Backend {backend_tag} is not registered") - stored_backend_config = self.backends[backend_tag].backend_config + stored_backend_config = self._backend_metadata[ + backend_tag].backend_config updated_config = stored_backend_config.copy( update=config_options.dict(exclude_unset=True)) updated_config._validate_complete() - self.backends[backend_tag].backend_config = updated_config + self._backend_metadata[backend_tag].backend_config = updated_config new_goal_id, existing_goal_id = self._set_backend_goal( - backend_tag, self.backends[backend_tag]) - - # Scale the replicas with the new configuration. 
- self.scale_backend_replicas(backend_tag, updated_config.num_replicas) + backend_tag, self._backend_metadata[backend_tag]) # NOTE(edoakes): we must write a checkpoint before pushing the # update to avoid inconsistent state if we crash after pushing the @@ -260,31 +381,38 @@ def _start_backend_replica(self, backend_tag: BackendTag, def scale_backend_replicas( self, backend_tag: BackendTag, - num_replicas: int, - force_kill: bool = False, - ) -> None: + ) -> bool: """Scale the given backend to the number of replicas. NOTE: this does not actually start or stop the replicas, but instead - adds the intention to start/stop them to self.backend_replicas_to_start - and self.backend_replicas_to_stop. The caller is responsible for then - first writing a checkpoint and then actually starting/stopping the - intended replicas. This avoids inconsistencies with starting/stopping a - replica and then crashing before writing a checkpoint. + adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP. + The caller is responsible for then first writing a checkpoint and then + actually starting/stopping the intended replicas. This avoids + inconsistencies with starting/stopping a replica and then crashing + before writing a checkpoint. 
""" + num_replicas = self._target_replicas.get(backend_tag, 0) logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) - assert (backend_tag in self.backends + assert (backend_tag in self._backend_metadata ), "Backend {} is not registered.".format(backend_tag) assert num_replicas >= 0, ("Number of replicas must be" " greater than or equal to 0.") - current_num_replicas = len(self.backend_replicas[backend_tag]) + current_num_replicas = sum([ + len(self._replicas[backend_tag][ReplicaState.SHOULD_START]), + len(self._replicas[backend_tag][ReplicaState.STARTING]), + len(self._replicas[backend_tag][ReplicaState.RUNNING]), + ]) + delta_num_replicas = num_replicas - current_num_replicas - backend_info: BackendInfo = self.backends[backend_tag] - if delta_num_replicas > 0: + backend_info: BackendInfo = self._backend_metadata[backend_tag] + if delta_num_replicas == 0: + return False + + elif delta_num_replicas > 0: can_schedule = try_schedule_resources_on_nodes(requirements=[ backend_info.replica_config.resource_dict for _ in range(delta_num_replicas) @@ -292,10 +420,11 @@ def scale_backend_replicas( if _RESOURCE_CHECK_ENABLED and not all(can_schedule): num_possible = sum(can_schedule) - raise RayServeException( + logger.error( "Cannot scale backend {} to {} replicas. Ray Serve tried " "to add {} replicas but the resources only allows {} " - "to be added. To fix this, consider scaling to replica to " + "to be added. This is not a problem if the cluster is " + "autoscaling. To fix this, consider scaling to replica to " "{} or add more resources to the cluster. 
You can check " "avaiable resources with ray.nodes().".format( backend_tag, num_replicas, delta_num_replicas, @@ -305,154 +434,132 @@ def scale_backend_replicas( delta_num_replicas, backend_tag)) for _ in range(delta_num_replicas): replica_tag = "{}#{}".format(backend_tag, get_random_letters()) - self.backend_replicas_to_start[backend_tag].append(replica_tag) + self._replicas[backend_tag][ReplicaState.SHOULD_START].append( + BackendReplica(self._controller_name, self._detached, + replica_tag, backend_tag)) elif delta_num_replicas < 0: logger.debug("Removing {} replicas from backend '{}'".format( -delta_num_replicas, backend_tag)) - assert len( - self.backend_replicas[backend_tag]) >= delta_num_replicas - replicas_copy = self.backend_replicas.copy() + assert self._target_replicas[backend_tag] >= delta_num_replicas + for _ in range(-delta_num_replicas): - replica_tag, _ = replicas_copy[backend_tag].popitem() + replica_state_dict = self._replicas[backend_tag] + list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \ + or replica_state_dict[ReplicaState.STARTING] \ + or replica_state_dict[ReplicaState.RUNNING] + + assert len(list_to_use), replica_state_dict + replica_to_stop = list_to_use.pop() graceful_timeout_s = (backend_info.backend_config. 
experimental_graceful_shutdown_timeout_s) - if force_kill: - graceful_timeout_s = 0 - self.backend_replicas_to_stop[backend_tag].append(( - replica_tag, - graceful_timeout_s, - )) - - def _start_pending_replicas(self): - for backend_tag, replicas_to_create in self.backend_replicas_to_start.\ - items(): - for replica_tag in replicas_to_create: - replica_handle = self._start_backend_replica( - backend_tag, replica_tag) - ready_future = replica_handle.ready.remote().as_future() - self.currently_starting_replicas[ready_future] = ( - backend_tag, replica_tag, replica_handle) - - def _stop_pending_replicas(self): - for backend_tag, replicas_to_stop in ( - self.backend_replicas_to_stop.items()): - for replica_tag, shutdown_timeout in replicas_to_stop: - replica_name = format_actor_name(replica_tag, - self._controller_name) - - async def kill_actor(replica_name_to_use): - # NOTE: the replicas may already be stopped if we failed - # after stopping them but before writing a checkpoint. - try: - replica = ray.get_actor(replica_name_to_use) - except ValueError: - return - - try: - await asyncio.wait_for( - replica.drain_pending_queries.remote(), - timeout=shutdown_timeout) - except asyncio.TimeoutError: - # Graceful period passed, kill it forcefully. 
- logger.debug( - f"{replica_name_to_use} did not shutdown after " - f"{shutdown_timeout}s, killing.") - finally: - ray.kill(replica, no_restart=True) - - self.currently_stopping_replicas[asyncio.ensure_future( - kill_actor(replica_name))] = (backend_tag, replica_tag) - - async def _check_currently_starting_replicas(self) -> int: - """Returns the number of pending replicas waiting to start""" - in_flight: Set[Future[Any]] = set() - - if self.currently_starting_replicas: - done, in_flight = await asyncio.wait( - list(self.currently_starting_replicas.keys()), timeout=0) - for fut in done: - (backend_tag, replica_tag, - replica_handle) = self.currently_starting_replicas.pop(fut) - self.backend_replicas[backend_tag][ - replica_tag] = replica_handle - - backend = self.backend_replicas_to_start.get(backend_tag) - if backend: - try: - backend.remove(replica_tag) - except ValueError: - pass - if len(backend) == 0: - del self.backend_replicas_to_start[backend_tag] - - async def _check_currently_stopping_replicas(self) -> int: - """Returns the number of replicas waiting to stop""" - in_flight: Set[Future[Any]] = set() - - if self.currently_stopping_replicas: - done_stopping, in_flight = await asyncio.wait( - list(self.currently_stopping_replicas.keys()), timeout=0) - for fut in done_stopping: - (backend_tag, - replica_tag) = self.currently_stopping_replicas.pop(fut) - - backend_to_stop = self.backend_replicas_to_stop.get( - backend_tag) - - if backend_to_stop: - try: - backend_to_stop.remove(replica_tag) - except ValueError: - pass - if len(backend_to_stop) == 0: - del self.backend_replicas_to_stop[backend_tag] - - backend = self.backend_replicas.get(backend_tag) - if backend: - try: - del backend[replica_tag] - except KeyError: - pass - - if len(self.backend_replicas[backend_tag]) == 0: - del self.backend_replicas[backend_tag] + + replica_to_stop.set_should_stop(graceful_timeout_s) + self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append( + replica_to_stop) + + return 
True + + def scale_all_backends(self): + checkpoint_needed = False + for backend_tag, num_replicas in list(self._target_replicas.items()): + checkpoint_needed = (checkpoint_needed + or self.scale_backend_replicas(backend_tag)) + if num_replicas == 0: + del self._backend_metadata[backend_tag] + del self._target_replicas[backend_tag] + + if checkpoint_needed: + self._checkpoint() + + def _pop_replicas_of_state(self, state: ReplicaState + ) -> List[Tuple[ReplicaState, BackendTag]]: + replicas = [] + for backend_tag, state_to_replica_dict in self._replicas.items(): + if state in state_to_replica_dict: + replicas.extend( + (replica, backend_tag) + for replica in state_to_replica_dict.pop(state)) + + return replicas def _completed_goals(self) -> List[GoalId]: completed_goals = [] - all_tags = set(self.backend_replicas.keys()).union( - set(self.backends.keys())) + all_tags = set(self._replicas.keys()).union( + set(self._backend_metadata.keys())) for backend_tag in all_tags: - desired_info = self.backends.get(backend_tag) - existing_info = self.backend_replicas.get(backend_tag) + desired_num_replicas = self._target_replicas.get(backend_tag) + state_dict = self._replicas.get(backend_tag, {}) + existing_info = state_dict.get(ReplicaState.RUNNING, []) + + # If we have pending ops, the current goal is *not* ready + if (state_dict.get(ReplicaState.SHOULD_START) + or state_dict.get(ReplicaState.STARTING) + or state_dict.get(ReplicaState.SHOULD_STOP) + or state_dict.get(ReplicaState.STOPPING)): + continue + + # TODO(ilr): FIX # Check for deleting - if (not desired_info or - desired_info.backend_config.num_replicas == 0) and \ + if (not desired_num_replicas or + desired_num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): - completed_goals.append(self.backend_goals.get(backend_tag)) + completed_goals.append( + self.backend_goals.pop(backend_tag, None)) # Check for a non-zero number of backends - if desired_info and existing_info and 
desired_info.backend_config.\ - num_replicas == len(existing_info): - completed_goals.append(self.backend_goals.get(backend_tag)) + if (desired_num_replicas and existing_info) \ + and desired_num_replicas == len(existing_info): + completed_goals.append( + self.backend_goals.pop(backend_tag, None)) return [goal for goal in completed_goals if goal] async def update(self) -> bool: + self.scale_all_backends() + for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) - self._start_pending_replicas() - self._stop_pending_replicas() - - num_starting = len(self.currently_starting_replicas) - num_stopping = len(self.currently_stopping_replicas) - - await self._check_currently_starting_replicas() - await self._check_currently_stopping_replicas() - - if (len(self.currently_starting_replicas) != num_starting) or \ - (len(self.currently_stopping_replicas) != num_stopping): + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.SHOULD_START): + replica_state.start(self._backend_metadata[backend_tag]) + self._replicas[backend_tag][ReplicaState.STARTING].append( + replica_state) + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.SHOULD_STOP): + replica_state.stop() + self._replicas[backend_tag][ReplicaState.STOPPING].append( + replica_state) + + transition_triggered = False + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.STARTING): + if replica_state.check_started(): + self._replicas[backend_tag][ReplicaState.RUNNING].append( + replica_state) + transition_triggered = True + else: + self._replicas[backend_tag][ReplicaState.STARTING].append( + replica_state) + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.STOPPING): + if replica_state.check_stopped(): + transition_triggered = True + else: + self._replicas[backend_tag][ReplicaState.STOPPING].append( + replica_state) + + for backend_tag in list(self._replicas.keys()): + if not 
any(self._replicas[backend_tag]): + del self._replicas[backend_tag] + del self._backend_metadata[backend_tag] + del self._target_replicas[backend_tag] + + if transition_triggered: self._checkpoint() self._notify_replica_handles_changed() diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 205af81b065a..41a1eca08ae8 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import BaseModel, PositiveFloat, PositiveInt, validator +from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT) @@ -64,7 +64,7 @@ class BackendConfig(BaseModel): user_config: Any = None experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0 - experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0 + experimental_graceful_shutdown_timeout_s: confloat(ge=0) = 20.0 class Config: validate_assignment = True diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index a3c75c711878..b5c65111a8f9 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -118,7 +118,7 @@ async def run_control_loop(self) -> None: def _all_replica_handles( self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: """Used for testing.""" - return self.backend_state.get_replica_handles() + return self.backend_state.get_running_replica_handles() def get_all_backends(self) -> Dict[BackendTag, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" @@ -235,7 +235,7 @@ async def shutdown(self) -> None: async with self.write_lock: for proxy in self.http_state.get_http_proxy_handles().values(): ray.kill(proxy, no_restart=True) - for replica_dict in self.backend_state.get_replica_handles( + for replica_dict in self.backend_state.get_running_replica_handles( ).values(): for replica in replica_dict.values(): 
ray.kill(replica, no_restart=True) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index 202b01386059..a35f7e54b361 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -683,6 +683,9 @@ def f(): client.create_endpoint("endpoint", backend="backend") +# This error is only printed because creation is run in the control loop, not +# in the API path. +@pytest.mark.skip() def test_create_infeasible_error(serve_instance): client = serve_instance From 088b604d853388bf530181b1abdce3060a8113b3 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 230/244] Revert "[Dashboard]Don't set node actors when node_id of actor is Nil (#13573)" This reverts commit c33f889979669612dbef8167f5fe38fa0ca1dccf. --- .../stats_collector/stats_collector_consts.py | 3 -- .../stats_collector/stats_collector_head.py | 15 +++---- .../tests/test_stats_collector.py | 44 ------------------- 3 files changed, 6 insertions(+), 56 deletions(-) diff --git a/dashboard/modules/stats_collector/stats_collector_consts.py b/dashboard/modules/stats_collector/stats_collector_consts.py index cdcbf6bd126d..55119cd75dfa 100644 --- a/dashboard/modules/stats_collector/stats_collector_consts.py +++ b/dashboard/modules/stats_collector/stats_collector_consts.py @@ -1,8 +1,5 @@ -import ray - NODE_STATS_UPDATE_INTERVAL_SECONDS = 1 RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS = 1 ACTOR_CHANNEL = "ACTOR" ERROR_INFO_UPDATE_INTERVAL_SECONDS = 5 LOG_INFO_UPDATE_INTERVAL_SECONDS = 5 -NIL_NODE_ID = ray.NodeID.nil().hex() diff --git a/dashboard/modules/stats_collector/stats_collector_head.py b/dashboard/modules/stats_collector/stats_collector_head.py index e0b6cffa77b8..aa37e2e6e107 100644 --- a/dashboard/modules/stats_collector/stats_collector_head.py +++ b/dashboard/modules/stats_collector/stats_collector_head.py @@ -203,10 +203,8 @@ def _process_actor_table_data(data): 
node_id = actor_table_data["address"]["rayletId"] job_actors.setdefault(job_id, {})[actor_id] = actor_table_data - # Update only when node_id is not Nil. - if node_id != stats_collector_consts.NIL_NODE_ID: - node_actors.setdefault( - node_id, {})[actor_id] = actor_table_data + node_actors.setdefault(node_id, + {})[actor_id] = actor_table_data DataSource.job_actors.reset(job_actors) DataSource.node_actors.reset(node_actors) logger.info("Received %d actor info from GCS.", @@ -235,11 +233,10 @@ def _process_actor_table_data(data): node_id = actor_table_data["address"]["rayletId"] # Update actors. DataSource.actors[actor_id] = actor_table_data - # Update node actors (only when node_id is not Nil). - if node_id != stats_collector_consts.NIL_NODE_ID: - node_actors = dict(DataSource.node_actors.get(node_id, {})) - node_actors[actor_id] = actor_table_data - DataSource.node_actors[node_id] = node_actors + # Update node actors. + node_actors = dict(DataSource.node_actors.get(node_id, {})) + node_actors[actor_id] = actor_table_data + DataSource.node_actors[node_id] = node_actors # Update job actors. 
job_actors = dict(DataSource.job_actors.get(job_id, {})) job_actors[actor_id] = actor_table_data diff --git a/dashboard/modules/stats_collector/tests/test_stats_collector.py b/dashboard/modules/stats_collector/tests/test_stats_collector.py index fcd1c42e3456..bed6d650fc29 100644 --- a/dashboard/modules/stats_collector/tests/test_stats_collector.py +++ b/dashboard/modules/stats_collector/tests/test_stats_collector.py @@ -8,8 +8,6 @@ import pytest import ray import threading -import ray.new_dashboard.modules.stats_collector.stats_collector_consts \ - as stats_collector_consts from datetime import datetime, timedelta from ray.cluster_utils import Cluster from ray.new_dashboard.tests.conftest import * # noqa @@ -375,47 +373,5 @@ def check_errs(): check_errs, (AssertionError), timeout_ms=1000) -def test_nil_node(enable_test_module, disable_aiohttp_cache, - ray_start_with_dashboard): - assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) - is True) - webui_url = ray_start_with_dashboard["webui_url"] - assert wait_until_server_available(webui_url) - webui_url = format_web_url(webui_url) - - @ray.remote(num_gpus=1) - class InfeasibleActor: - pass - - infeasible_actor = InfeasibleActor.remote() # noqa - - timeout_seconds = 5 - start_time = time.time() - last_ex = None - while True: - time.sleep(1) - try: - resp = requests.get(f"{webui_url}/logical/actors") - resp_json = resp.json() - resp_data = resp_json["data"] - actors = resp_data["actors"] - assert len(actors) == 1 - response = requests.get(webui_url + "/test/dump?key=node_actors") - response.raise_for_status() - result = response.json() - assert stats_collector_consts.NIL_NODE_ID not in result["data"][ - "nodeActors"] - break - except Exception as ex: - last_ex = ex - finally: - if time.time() > start_time + timeout_seconds: - ex_stack = traceback.format_exception( - type(last_ex), last_ex, - last_ex.__traceback__) if last_ex else [] - ex_stack = "".join(ex_stack) - raise Exception(f"Timed out 
while testing, {ex_stack}") - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) From 11aff1714d2358f72b8cc8249bb7868474b170a3 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 231/244] Revert "[dashboard] Fixes dashboard issues when environments have set http_proxy (#12598)" This reverts commit b40db307e084c798654357d679f4fbc4be534667. --- dashboard/agent.py | 3 +- dashboard/head.py | 4 +- .../modules/logical_view/logical_view_head.py | 4 +- dashboard/modules/reporter/reporter_head.py | 4 +- .../stats_collector/stats_collector_head.py | 3 +- dashboard/tests/conftest.py | 57 ++++++------------- dashboard/tests/test_dashboard.py | 33 ----------- dashboard/utils.py | 21 ++++--- 8 files changed, 32 insertions(+), 97 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index 7bf5e1551a2b..f34024e545c7 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -75,9 +75,8 @@ def __init__(self, logger.info("Dashboard agent grpc address: %s:%s", self.ip, self.grpc_port) self.aioredis_client = None - options = (("grpc.enable_http_proxy", 0), ) self.aiogrpc_raylet_channel = aiogrpc.insecure_channel( - f"{self.ip}:{self.node_manager_port}", options=options) + f"{self.ip}:{self.node_manager_port}") self.http_session = None def _load_modules(self): diff --git a/dashboard/head.py b/dashboard/head.py index f1ef75ef478d..e8e9119132d2 100644 --- a/dashboard/head.py +++ b/dashboard/head.py @@ -159,9 +159,7 @@ async def run(self): if not gcs_address: raise Exception("GCS address not found.") logger.info("Connect to GCS at %s", gcs_address) - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel( - gcs_address, options=options) + channel = aiogrpc.insecure_channel(gcs_address) except Exception as ex: logger.error("Connect to GCS failed: %s, retry...", ex) await asyncio.sleep( diff --git a/dashboard/modules/logical_view/logical_view_head.py 
b/dashboard/modules/logical_view/logical_view_head.py index 6b8e0bae1ecb..cf29db637da1 100644 --- a/dashboard/modules/logical_view/logical_view_head.py +++ b/dashboard/modules/logical_view/logical_view_head.py @@ -46,9 +46,7 @@ async def kill_actor(self, req) -> aiohttp.web.Response: except KeyError: return rest_response(success=False, message="Bad Request") try: - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel( - f"{ip_address}:{port}", options=options) + channel = aiogrpc.insecure_channel(f"{ip_address}:{port}") stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel) await stub.KillActor( diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py index 2d84c6b65c21..8faef274d60c 100644 --- a/dashboard/modules/reporter/reporter_head.py +++ b/dashboard/modules/reporter/reporter_head.py @@ -38,9 +38,7 @@ async def _update_stubs(self, change): if change.new: node_id, ports = change.new ip = DataSource.node_id_to_ip[node_id] - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel( - f"{ip}:{ports[1]}", options=options) + channel = aiogrpc.insecure_channel(f"{ip}:{ports[1]}") stub = reporter_pb2_grpc.ReporterServiceStub(channel) self._stubs[ip] = stub diff --git a/dashboard/modules/stats_collector/stats_collector_head.py b/dashboard/modules/stats_collector/stats_collector_head.py index aa37e2e6e107..ae75864e50ca 100644 --- a/dashboard/modules/stats_collector/stats_collector_head.py +++ b/dashboard/modules/stats_collector/stats_collector_head.py @@ -71,8 +71,7 @@ async def _update_stubs(self, change): node_id, node_info = change.new address = "{}:{}".format(node_info["nodeManagerAddress"], int(node_info["nodeManagerPort"])) - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel(address, options=options) + channel = aiogrpc.insecure_channel(address) stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel) self._stubs[node_id] = stub 
diff --git a/dashboard/tests/conftest.py b/dashboard/tests/conftest.py index ec893fbef252..cb49e8bfc94a 100644 --- a/dashboard/tests/conftest.py +++ b/dashboard/tests/conftest.py @@ -1,40 +1,17 @@ -import os -import pytest -from ray.tests.conftest import * # noqa - - -@pytest.fixture -def enable_test_module(): - os.environ["RAY_DASHBOARD_MODULE_TEST"] = "true" - yield - os.environ.pop("RAY_DASHBOARD_MODULE_TEST", None) - - -@pytest.fixture -def disable_aiohttp_cache(): - os.environ["RAY_DASHBOARD_NO_CACHE"] = "true" - yield - os.environ.pop("RAY_DASHBOARD_NO_CACHE", None) - - -@pytest.fixture -def set_http_proxy(): - http_proxy = os.environ.get("http_proxy", None) - https_proxy = os.environ.get("https_proxy", None) - - # set http proxy - os.environ["http_proxy"] = "www.example.com:990" - os.environ["https_proxy"] = "www.example.com:990" - - yield - - # reset http proxy - if http_proxy: - os.environ["http_proxy"] = http_proxy - else: - del os.environ["http_proxy"] - - if https_proxy: - os.environ["https_proxy"] = https_proxy - else: - del os.environ["https_proxy"] +import os +import pytest +from ray.tests.conftest import * # noqa + + +@pytest.fixture +def enable_test_module(): + os.environ["RAY_DASHBOARD_MODULE_TEST"] = "true" + yield + os.environ.pop("RAY_DASHBOARD_MODULE_TEST", None) + + +@pytest.fixture +def disable_aiohttp_cache(): + os.environ["RAY_DASHBOARD_NO_CACHE"] = "true" + yield + os.environ.pop("RAY_DASHBOARD_NO_CACHE", None) diff --git a/dashboard/tests/test_dashboard.py b/dashboard/tests/test_dashboard.py index 529e394613d0..1acc94a169fe 100644 --- a/dashboard/tests/test_dashboard.py +++ b/dashboard/tests/test_dashboard.py @@ -571,38 +571,5 @@ def test_immutable_types(): print(d3[1]) -def test_http_proxy(enable_test_module, set_http_proxy, shutdown_only): - address_info = ray.init(num_cpus=1, include_dashboard=True) - assert (wait_until_server_available(address_info["webui_url"]) is True) - - webui_url = address_info["webui_url"] - webui_url = 
format_web_url(webui_url) - - timeout_seconds = 10 - start_time = time.time() - while True: - time.sleep(1) - try: - response = requests.get( - webui_url + "/test/dump", - proxies={ - "http": None, - "https": None - }) - response.raise_for_status() - try: - response.json() - assert response.ok - except Exception as ex: - logger.info("failed response: %s", response.text) - raise ex - break - except (AssertionError, requests.exceptions.ConnectionError) as e: - logger.info("Retry because of %s", e) - finally: - if time.time() > start_time + timeout_seconds: - raise Exception("Timed out while testing.") - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/dashboard/utils.py b/dashboard/utils.py index 5c347ed32a49..e1379eea8e14 100644 --- a/dashboard/utils.py +++ b/dashboard/utils.py @@ -1,35 +1,34 @@ import abc +import os +import socket +import time import asyncio import collections +import json import datetime import functools import importlib import inspect -import json import logging -import os import pkgutil -import socket import traceback -from abc import ABCMeta, abstractmethod from base64 import b64decode -from collections import namedtuple +from abc import ABCMeta, abstractmethod from collections.abc import MutableMapping, Mapping, Sequence +from collections import namedtuple from typing import Any -import aiohttp.signals -import aiohttp.web import aioredis -import time +import aiohttp.web +import ray.new_dashboard.consts as dashboard_consts from aiohttp import hdrs from aiohttp.frozenlist import FrozenList from aiohttp.typedefs import PathLike from aiohttp.web import RouteDef +import aiohttp.signals from google.protobuf.json_format import MessageToDict - -import ray.new_dashboard.consts as dashboard_consts -from ray.ray_constants import env_bool from ray.utils import binary_to_hex +from ray.ray_constants import env_bool try: create_task = asyncio.create_task From f3009df858253cc0bd61bc726773b39c2e46e9d7 Mon Sep 17 00:00:00 2001 
From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 232/244] Revert "[autoscaler] remove worker_default_node_type that is useless. (#13588)" This reverts commit 773d13c4bfc8b3d851238c86f294cfd47c16c62d. --- doc/source/cluster/autoscaling.rst | 3 +++ python/ray/autoscaler/_private/util.py | 8 ++++++++ python/ray/autoscaler/aws/example-multi-node-type.yaml | 3 +++ .../kubernetes/operator_configs/cluster_crd.yaml | 4 ++++ .../kubernetes/operator_configs/example_cluster.yaml | 2 ++ .../kubernetes/operator_configs/example_cluster2.yaml | 2 ++ python/ray/autoscaler/ray-schema.json | 4 ++++ .../ray/autoscaler/staroid/example-multi-node-type.yaml | 3 +++ python/ray/operator/operator_utils.py | 1 + python/ray/tests/test_resource_demand_scheduler.py | 3 ++- 10 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/cluster/autoscaling.rst b/doc/source/cluster/autoscaling.rst index ecb7af15565a..e8d8f235d4e5 100644 --- a/doc/source/cluster/autoscaling.rst +++ b/doc/source/cluster/autoscaling.rst @@ -111,6 +111,9 @@ An example of configuring multiple node types is as follows `(full example) None: if config["head_node_type"] not in config["available_node_types"]: raise ValueError( "`head_node_type` must be one of `available_node_types`.") + if "worker_default_node_type" not in config: + raise ValueError("You must specify `worker_default_node_type` if " + "`available_node_types is set.") + if (config["worker_default_node_type"] not in config[ + "available_node_types"]): + raise ValueError("`worker_default_node_type` must be one of " + "`available_node_types`.") def prepare_config(config): @@ -116,6 +123,7 @@ def rewrite_legacy_yaml_to_available_node_types( }, } config["head_node_type"] = NODE_TYPE_LEGACY_HEAD + config["worker_default_node_type"] = NODE_TYPE_LEGACY_WORKER return config diff --git a/python/ray/autoscaler/aws/example-multi-node-type.yaml 
b/python/ray/autoscaler/aws/example-multi-node-type.yaml index 1a83b8cc6212..56b5c1b78d2e 100644 --- a/python/ray/autoscaler/aws/example-multi-node-type.yaml +++ b/python/ray/autoscaler/aws/example-multi-node-type.yaml @@ -55,6 +55,9 @@ available_node_types: # Specify the node type of the head node (as configured above). head_node_type: cpu_4_ondemand +# Specify the default type of the worker node (as configured above). +worker_default_node_type: cpu_16_spot + # The default settings for the head node. This will be merged with the per-node # type configs given above. head_node: diff --git a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml index 75a802b58d87..9e92d5d4f6bc 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml @@ -25,6 +25,7 @@ spec: required: - podTypes - headPodType + - workerDefaultPodType properties: maxWorkers: description: The maximum number of workers nodes to launch in addition to the @@ -4263,6 +4264,9 @@ spec: headPodType: description: Specifies the head node type. type: string + workerDefaultPodType: + description: Specifies the default worker node type. + type: string headStartRayCommands: description: Commands to start Ray on the head node. type: array diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml index 8d2aa4561936..bb4a71fcc203 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml @@ -14,6 +14,8 @@ spec: idleTimeoutMinutes: 5 # Specify the pod type for the ray head node (as configured below). headPodType: head-node + # Specify the default pod type for ray the worker nodes (as configured below). 
+ workerDefaultPodType: worker-nodes # Specify the allowed pod types for this ray cluster and the resources they provide. podTypes: - name: head-node diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml index 0c6eb604e1eb..e5e4ecf3197a 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml @@ -14,6 +14,8 @@ spec: idleTimeoutMinutes: 5 # Specify the pod type for the ray head node (as configured below). headPodType: head-node + # Specify the default pod type for ray the worker nodes (as configured below). + workerDefaultPodType: worker-nodes # Specify the allowed pod types for this ray cluster and the resources they provide. podTypes: - name: head-node diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index 22b21b84cb66..41a4a070832e 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -254,6 +254,10 @@ "type": "string", "description": "If using multiple node types, specifies the head node type." }, + "worker_default_node_type": { + "type": "string", + "description": "If using multiple node types, specifies the default worker node type." + }, "head_node": { "type": "object", "description": "Provider-specific config for the head node, e.g. instance type." diff --git a/python/ray/autoscaler/staroid/example-multi-node-type.yaml b/python/ray/autoscaler/staroid/example-multi-node-type.yaml index 563e3a74c6e4..860bb6a87674 100644 --- a/python/ray/autoscaler/staroid/example-multi-node-type.yaml +++ b/python/ray/autoscaler/staroid/example-multi-node-type.yaml @@ -103,6 +103,9 @@ available_node_types: # Specify the node type of the head node (as configured above). head_node_type: cpu_4_ondemand +# Specify the default type of the worker node (as configured above). 
+worker_default_node_type: cpu_4_spot + # The default settings for the head node. This will be merged with the per-node # type configs given above. #head_node: diff --git a/python/ray/operator/operator_utils.py b/python/ray/operator/operator_utils.py index 94d2a00cf34e..08926a723857 100644 --- a/python/ray/operator/operator_utils.py +++ b/python/ray/operator/operator_utils.py @@ -17,6 +17,7 @@ "upscalingSpeed": "upscaling_speed", "idleTimeoutMinutes": "idle_timeout_minutes", "headPodType": "head_node_type", + "workerDefaultPodType": "worker_default_node_type", "workerStartRayCommands": "worker_start_ray_commands", "headStartRayCommands": "head_start_ray_commands", "podTypes": "available_node_types" diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 536cbe18bc5a..3bfe28f7cc83 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -87,7 +87,8 @@ MULTI_WORKER_CLUSTER = dict( SMALL_CLUSTER, **{ "available_node_types": TYPES_A, - "head_node_type": "empty_node" + "head_node_type": "empty_node", + "worker_default_node_type": "m4.large", }) From c1c9e5f9d838fb3bd8cd53704f860aadb5625e6d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 233/244] Revert "[Autoscaler] Ensure ubuntu is owner of docker host mount folder (#13579)" This reverts commit 556f2be958a2b3c224024e269ece300abe79d9c5. 
--- python/ray/autoscaler/_private/command_runner.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index 544e8b1077e4..f328d4fd6c1a 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -632,10 +632,8 @@ def run_rsync_up(self, source, target, options=None): self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), target.lstrip("/")) - host_mount_location = os.path.dirname(host_destination.rstrip("/")) self.ssh_command_runner.run( - f"mkdir -p {host_mount_location} && chown -R " - f"{self.ssh_command_runner.ssh_user} {host_mount_location}", + f"mkdir -p {os.path.dirname(host_destination.rstrip('/'))}", silent=is_rsync_silent()) self.ssh_command_runner.run_rsync_up( @@ -657,10 +655,8 @@ def run_rsync_down(self, source, target, options=None): host_source = os.path.join( self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), source.lstrip("/")) - host_mount_location = os.path.dirname(host_source.rstrip("/")) self.ssh_command_runner.run( - f"mkdir -p {host_mount_location} && chown -R " - f"{self.ssh_command_runner.ssh_user} {host_mount_location}", + f"mkdir -p {os.path.dirname(host_source.rstrip('/'))}", silent=is_rsync_silent()) if source[-1] == "/": source += "." From 45f915bbf650fa3eff6f5202cc2ef02609987358 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 234/244] Revert "[core] Admission control for pulling objects to the local node (#13514)" This reverts commit e281944d17de5426766bfc4b7925f53411171216. 
--- .travis.yml | 11 +- BUILD.bazel | 24 + python/ray/tests/test_object_manager.py | 83 --- python/ray/tests/test_object_spilling.py | 61 --- python/ray/tests/test_reconstruction.py | 3 - src/ray/core_worker/core_worker.cc | 1 - src/ray/core_worker/reference_count.cc | 9 - src/ray/core_worker/reference_count.h | 6 - src/ray/gcs/accessor.h | 2 +- .../gcs/gcs_client/service_based_accessor.cc | 4 - .../gcs/gcs_client/service_based_accessor.h | 2 +- .../test/global_state_accessor_test.cc | 2 +- .../test/service_based_gcs_client_test.cc | 2 +- src/ray/gcs/gcs_server/gcs_object_manager.cc | 7 +- src/ray/gcs/gcs_server/gcs_object_manager.h | 1 - src/ray/object_manager/object_directory.cc | 41 +- src/ray/object_manager/object_directory.h | 8 +- src/ray/object_manager/object_manager.cc | 60 +-- .../ownership_based_object_directory.cc | 11 +- .../object_manager/plasma/eviction_policy.h | 2 - src/ray/object_manager/plasma/store.h | 7 - src/ray/object_manager/plasma/store_runner.h | 9 +- src/ray/object_manager/pull_manager.cc | 231 +------- src/ray/object_manager/pull_manager.h | 96 +--- .../test/object_manager_stress_test.cc | 453 ++++++++++++++++ .../test/object_manager_test.cc | 496 ++++++++++++++++++ .../object_manager/test/pull_manager_test.cc | 318 ++--------- src/ray/protobuf/core_worker.proto | 1 - src/ray/protobuf/gcs.proto | 4 - src/ray/protobuf/gcs_service.proto | 2 - src/ray/raylet/reconstruction_policy.cc | 2 +- src/ray/raylet/reconstruction_policy_test.cc | 4 +- .../raylet/test/local_object_manager_test.cc | 5 +- src/ray/test/run_object_manager_tests.sh | 43 ++ 34 files changed, 1138 insertions(+), 873 deletions(-) create mode 100644 src/ray/object_manager/test/object_manager_stress_test.cc create mode 100644 src/ray/object_manager/test/object_manager_test.cc create mode 100755 src/ray/test/run_object_manager_tests.sh diff --git a/.travis.yml b/.travis.yml index 5170ed0864b8..36e49aaa74ef 100644 --- a/.travis.yml +++ b/.travis.yml @@ -78,9 +78,7 @@ matrix: - . 
./ci/travis/ci.sh build script: # Run all C++ unit tests with ASAN enabled. ASAN adds too much overhead to run Python tests. - # NOTE: core_worker_test is out-of-date and should already covered by - # Python tests. - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -core_worker_test + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all - os: osx osx_image: xcode7 @@ -437,10 +435,11 @@ matrix: script: - . ./ci/travis/ci.sh test_cpp script: + # raylet integration tests (core_worker_tests included in bazel tests below) + - ./ci/suppress_output bash src/ray/test/run_object_manager_tests.sh + # cc bazel tests (w/o RLlib) - # NOTE: core_worker_test is out-of-date and should already covered by Python - # tests. - - ./ci/suppress_output bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -core_worker_test + - ./ci/suppress_output bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... 
# ray serve tests - if [ $RAY_CI_SERVE_AFFECTED == "1" ]; then ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only python/ray/serve/...; fi diff --git a/BUILD.bazel b/BUILD.bazel index c1745e468852..a863727ecd95 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1365,6 +1365,30 @@ cc_library( ], ) +cc_binary( + name = "object_manager_test", + testonly = 1, + srcs = ["src/ray/object_manager/test/object_manager_test.cc"], + copts = COPTS, + deps = [ + ":object_manager", + "//src/ray/protobuf:common_cc_proto", + "@com_google_googletest//:gtest_main", + ], +) + +cc_binary( + name = "object_manager_stress_test", + testonly = 1, + srcs = ["src/ray/object_manager/test/object_manager_stress_test.cc"], + copts = COPTS, + deps = [ + ":object_manager", + "//src/ray/protobuf:common_cc_proto", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "platform_shims", srcs = [] + select({ diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index e38733f62d7e..b29b9caa228f 100644 --- a/python/ray/tests/test_object_manager.py +++ b/python/ray/tests/test_object_manager.py @@ -296,89 +296,6 @@ def driver(): ray.get(driver.remote()) -@pytest.mark.skip( - reason="This hangs due to a deadlock between a worker getting its " - "arguments and the node pulling arguments for the next task queued.") -@pytest.mark.timeout(30) -def test_pull_bundles_admission_control(shutdown_only): - cluster = Cluster() - object_size = int(6e6) - num_objects = 10 - num_tasks = 10 - # Head node can fit all of the objects at once. - cluster.add_node( - num_cpus=0, - object_store_memory=2 * num_tasks * num_objects * object_size) - cluster.wait_for_nodes() - ray.init(address=cluster.address) - - # Worker node can only fit 1 task at a time. 
- cluster.add_node( - num_cpus=1, object_store_memory=1.5 * num_objects * object_size) - cluster.wait_for_nodes() - - @ray.remote - def foo(*args): - return - - args = [] - for _ in range(num_tasks): - task_args = [ - ray.put(np.zeros(object_size, dtype=np.uint8)) - for _ in range(num_objects) - ] - args.append(task_args) - - tasks = [foo.remote(*task_args) for task_args in args] - ray.get(tasks) - - -@pytest.mark.skip( - reason="This hangs due to a deadlock between a worker getting its " - "arguments and the node pulling arguments for the next task queued.") -@pytest.mark.timeout(30) -def test_pull_bundles_admission_control_dynamic(shutdown_only): - # This test is the same as test_pull_bundles_admission_control, except that - # the object store's capacity starts off higher and is later consumed - # dynamically by concurrent workers. - cluster = Cluster() - object_size = int(6e6) - num_objects = 10 - num_tasks = 10 - # Head node can fit all of the objects at once. - cluster.add_node( - num_cpus=0, - object_store_memory=2 * num_tasks * num_objects * object_size) - cluster.wait_for_nodes() - ray.init(address=cluster.address) - - # Worker node can fit 2 tasks at a time. 
- cluster.add_node( - num_cpus=1, object_store_memory=2.5 * num_objects * object_size) - cluster.wait_for_nodes() - - @ray.remote - def foo(*args): - return - - @ray.remote - def allocate(*args): - return np.zeros(object_size, dtype=np.uint8) - - args = [] - for _ in range(num_tasks): - task_args = [ - ray.put(np.zeros(object_size, dtype=np.uint8)) - for _ in range(num_objects) - ] - args.append(task_args) - - tasks = [foo.remote(*task_args) for task_args in args] - allocated = [allocate.remote() for _ in range(num_objects)] - ray.get(tasks) - del allocated - - if __name__ == "__main__": import pytest import sys diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 745eb3bafc1d..10b1da77306a 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -648,66 +648,5 @@ def test_release_during_plasma_fetch(tmp_path, shutdown_only): do_test_release_resource(tmp_path, expect_released=True) -@pytest.mark.skip( - reason="This hangs due to a deadlock between a worker getting its " - "arguments and the node pulling arguments for the next task queued.") -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -@pytest.mark.timeout(30) -def test_spill_objects_on_object_transfer(object_spilling_config, - ray_start_cluster): - # This test checks that objects get spilled to make room for transferred - # objects. - cluster = ray_start_cluster - object_size = int(1e7) - num_objects = 10 - num_tasks = 10 - # Head node can fit all of the objects at once. - cluster.add_node( - num_cpus=0, - object_store_memory=2 * num_tasks * num_objects * object_size, - _system_config={ - "max_io_workers": 1, - "automatic_object_spilling_enabled": True, - "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, - "min_spilling_size": 0 - }) - cluster.wait_for_nodes() - ray.init(address=cluster.address) - - # Worker node can fit 1 tasks at a time. 
- cluster.add_node( - num_cpus=1, object_store_memory=1.5 * num_objects * object_size) - cluster.wait_for_nodes() - - @ray.remote - def foo(*args): - return - - @ray.remote - def allocate(*args): - return np.zeros(object_size, dtype=np.uint8) - - # Allocate some objects that must be spilled to make room for foo's - # arguments. - allocated = [allocate.remote() for _ in range(num_objects)] - ray.get(allocated) - print("done allocating") - - args = [] - for _ in range(num_tasks): - task_args = [ - ray.put(np.zeros(object_size, dtype=np.uint8)) - for _ in range(num_objects) - ] - args.append(task_args) - - # Check that tasks scheduled to the worker node have enough room after - # spilling. - tasks = [foo.remote(*task_args) for task_args in args] - ray.get(tasks) - - if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index 1cd1f133a911..f5eed1e8fb23 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -372,7 +372,6 @@ def probe(): raise e.as_instanceof_cause() -@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled): config = { @@ -437,7 +436,6 @@ def dependent_task(x): raise e.as_instanceof_cause() -@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): config = { @@ -489,7 +487,6 @@ def dependent_task(x): raise e.as_instanceof_cause() -@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_reconstruction_stress(ray_start_cluster): config = { diff --git 
a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index f7e473eca5a2..21fc462a7af6 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -2213,7 +2213,6 @@ void CoreWorker::HandleGetObjectLocationsOwner( } else { status = Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); } - reply->set_object_size(reference_counter_->GetObjectSize(object_id)); send_reply_callback(status, nullptr, nullptr); } diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index ba2e20994e44..c638f831dbed 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -948,15 +948,6 @@ absl::optional> ReferenceCounter::GetObjectLocations return it->second.locations; } -size_t ReferenceCounter::GetObjectSize(const ObjectID &object_id) const { - absl::MutexLock lock(&mutex_); - auto it = object_id_refs_.find(object_id); - if (it == object_id_refs_.end()) { - return 0; - } - return it->second.object_size; -} - void ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id) { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index 9c0576393fb3..caceabc53ab5 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -397,12 +397,6 @@ class ReferenceCounter : public ReferenceCounterInterface, absl::optional> GetObjectLocations( const ObjectID &object_id) LOCKS_EXCLUDED(mutex_); - /// Get an object's size. This will return 0 if the object is out of scope. - /// - /// \param[in] object_id The object whose size to get. - /// \return Object size, or 0 if the object is out of scope. - size_t GetObjectSize(const ObjectID &object_id) const; - /// Handle an object has been spilled to external storage. 
/// /// This notifies the primary raylet that the object is safe to release and diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index ab0704bcadd7..83dc3de3ca46 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -297,7 +297,7 @@ class ObjectInfoAccessor { /// \param callback Callback that will be called after object has been added to GCS. /// \return Status virtual Status AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const StatusCallback &callback) = 0; + const StatusCallback &callback) = 0; /// Add spilled location of object to GCS asynchronously. /// diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index dfa192320976..f9380b78ee12 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -1070,7 +1070,6 @@ Status ServiceBasedObjectInfoAccessor::AsyncGetAll( Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const StatusCallback &callback) { RAY_LOG(DEBUG) << "Adding object location, object id = " << object_id << ", node id = " << node_id @@ -1078,7 +1077,6 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_i rpc::AddObjectLocationRequest request; request.set_object_id(object_id.Binary()); request.set_node_id(node_id.Binary()); - request.set_size(object_size); auto operation = [this, request, object_id, node_id, callback](const SequencerDoneCallback &done_callback) { @@ -1173,13 +1171,11 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations( rpc::ObjectLocationChange update; update.set_is_add(true); update.set_node_id(loc.manager()); - update.set_size(result->size()); notification.push_back(update); } if (!result->spilled_url().empty()) { rpc::ObjectLocationChange update; update.set_spilled_url(result->spilled_url()); - 
update.set_size(result->size()); notification.push_back(update); } subscribe(object_id, notification); diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index 2d362976dd22..b498e0acfd46 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -323,7 +323,7 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor { Status AsyncGetAll(const MultiItemCallback &callback) override; Status AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const StatusCallback &callback) override; + const StatusCallback &callback) override; Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, const StatusCallback &callback) override; diff --git a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc b/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc index e896beccb6f5..7af602808fc7 100644 --- a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc +++ b/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc @@ -283,7 +283,7 @@ TEST_F(GlobalStateAccessorTest, TestObjectTable) { NodeID node_id = NodeID::FromRandom(); std::promise promise; RAY_CHECK_OK(gcs_client_->Objects().AsyncAddLocation( - object_id, node_id, 0, + object_id, node_id, [&promise](Status status) { promise.set_value(status.ok()); })); WaitReady(promise.get_future(), timeout_ms_); } diff --git a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc index 3b1a6a69ad7a..3b0f731bbccd 100644 --- a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc +++ b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc @@ -450,7 +450,7 @@ class ServiceBasedGcsClientTest : public ::testing::Test { bool AddLocation(const ObjectID &object_id, const NodeID &node_id) { std::promise promise; 
RAY_CHECK_OK(gcs_client_->Objects().AsyncAddLocation( - object_id, node_id, 0, + object_id, node_id, [&promise](Status status) { promise.set_value(status.ok()); })); return WaitReady(promise.get_future(), timeout_ms_); } diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.cc b/src/ray/gcs/gcs_server/gcs_object_manager.cc index 73971ed7f18f..b5cc8f765113 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_object_manager.cc @@ -51,7 +51,6 @@ void GcsObjectManager::HandleGetAllObjectLocations( object_table_data.set_manager(node_id.Binary()); object_location_info.add_locations()->CopyFrom(object_table_data); } - object_location_info.set_size(item.second.object_size); reply->add_object_location_info_list()->CopyFrom(object_location_info); } RAY_LOG(DEBUG) << "Finished getting all object locations."; @@ -79,8 +78,7 @@ void GcsObjectManager::HandleAddObjectLocation( RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id; } - size_t size = request.size(); - auto on_done = [this, object_id, node_id, spilled_url, size, reply, + auto on_done = [this, object_id, node_id, spilled_url, reply, send_reply_callback](const Status &status) { if (status.ok()) { rpc::ObjectLocationChange notification; @@ -91,7 +89,6 @@ void GcsObjectManager::HandleAddObjectLocation( if (!spilled_url.empty()) { notification.set_spilled_url(spilled_url); } - notification.set_size(size); RAY_CHECK_OK(gcs_pub_sub_->Publish(OBJECT_CHANNEL, object_id.Hex(), notification.SerializeAsString(), nullptr)); RAY_LOG(DEBUG) << "Finished adding object location, job id = " @@ -110,7 +107,6 @@ void GcsObjectManager::HandleAddObjectLocation( }; absl::MutexLock lock(&mutex_); - object_to_locations_[object_id].object_size = size; const auto object_data = GenObjectLocationInfo(object_id); Status status = gcs_table_storage_->ObjectTable().Put(object_id, object_data, on_done); if (!status.ok()) { @@ -291,7 +287,6 @@ const ObjectLocationInfo 
GcsObjectManager::GenObjectLocationInfo( object_data.add_locations()->set_manager(node_id.Binary()); } object_data.set_spilled_url(it->second.spilled_url); - object_data.set_size(it->second.object_size); } return object_data; } diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.h b/src/ray/gcs/gcs_server/gcs_object_manager.h index 2afff0816850..bd21bfd1b977 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.h +++ b/src/ray/gcs/gcs_server/gcs_object_manager.h @@ -65,7 +65,6 @@ class GcsObjectManager : public rpc::ObjectInfoHandler { struct LocationSet { absl::flat_hash_set locations; std::string spilled_url = ""; - size_t object_size = 0; }; /// Add a location of objects. diff --git a/src/ray/object_manager/object_directory.cc b/src/ray/object_manager/object_directory.cc index ccfda7f5a37c..189cc0dd7d4b 100644 --- a/src/ray/object_manager/object_directory.cc +++ b/src/ray/object_manager/object_directory.cc @@ -31,21 +31,13 @@ using ray::rpc::ObjectTableData; /// object table entries up to but not including this notification. bool UpdateObjectLocations(const std::vector &location_updates, std::shared_ptr gcs_client, - std::unordered_set *node_ids, std::string *spilled_url, - size_t *object_size) { + std::unordered_set *node_ids, + std::string *spilled_url) { // location_updates contains the updates of locations of the object. // with GcsChangeMode, we can determine whether the update mode is // addition or deletion. bool isUpdated = false; for (const auto &update : location_updates) { - // The size can be 0 if the update was a deletion. This assumes that an - // object's size is always greater than 0. - // TODO(swang): If that's not the case, we should use a flag to check - // whether the size is set instead. 
- if (update.size() > 0) { - *object_size = update.size(); - } - if (!update.node_id().empty()) { NodeID node_id = NodeID::FromBinary(update.node_id()); if (update.is_add() && 0 == node_ids->count(node_id)) { @@ -81,10 +73,9 @@ bool UpdateObjectLocations(const std::vector &locatio ray::Status ObjectDirectory::ReportObjectAdded( const ObjectID &object_id, const NodeID &node_id, const object_manager::protocol::ObjectInfoT &object_info) { - size_t size = object_info.data_size + object_info.metadata_size; - RAY_LOG(DEBUG) << "Reporting object added to GCS " << object_id << " size " << size; + RAY_LOG(DEBUG) << "Reporting object added to GCS " << object_id; ray::Status status = - gcs_client_->Objects().AsyncAddLocation(object_id, node_id, size, nullptr); + gcs_client_->Objects().AsyncAddLocation(object_id, node_id, nullptr); return status; } @@ -128,14 +119,14 @@ void ObjectDirectory::HandleNodeRemoved(const NodeID &node_id) { // If the subscribed object has the removed node as a location, update // its locations with an empty update so that the location will be removed. UpdateObjectLocations({}, gcs_client_, &listener.second.current_object_locations, - &listener.second.spilled_url, &listener.second.object_size); + &listener.second.spilled_url); // Re-call all the subscribed callbacks for the object, since its // locations have changed. for (const auto &callback_pair : listener.second.callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, listener.second.current_object_locations, - listener.second.spilled_url, listener.second.object_size); + listener.second.spilled_url); } } } @@ -166,7 +157,7 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // Update entries for this object. 
if (!UpdateObjectLocations(object_notifications, gcs_client_, &it->second.current_object_locations, - &it->second.spilled_url, &it->second.object_size)) { + &it->second.spilled_url)) { return; } // Copy the callbacks so that the callbacks can unsubscribe without interrupting @@ -180,7 +171,7 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, it->second.current_object_locations, - it->second.spilled_url, it->second.object_size); + it->second.spilled_url); } }; status = gcs_client_->Objects().AsyncSubscribeToLocations( @@ -198,9 +189,8 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i if (listener_state.subscribed) { auto &locations = listener_state.current_object_locations; auto &spilled_url = listener_state.spilled_url; - auto object_size = it->second.object_size; - io_service_.post([callback, locations, spilled_url, object_size, object_id]() { - callback(object_id, locations, spilled_url, object_size); + io_service_.post([callback, locations, spilled_url, object_id]() { + callback(object_id, locations, spilled_url); }); } return status; @@ -233,9 +223,8 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, // cached locations. 
auto &locations = it->second.current_object_locations; auto &spilled_url = it->second.spilled_url; - auto object_size = it->second.object_size; - io_service_.post([callback, object_id, spilled_url, locations, object_size]() { - callback(object_id, locations, spilled_url, object_size); + io_service_.post([callback, object_id, spilled_url, locations]() { + callback(object_id, locations, spilled_url); }); } else { // We do not have any locations cached due to a concurrent @@ -263,12 +252,10 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, std::unordered_set node_ids; std::string spilled_url; - size_t object_size = 0; - UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url, - &object_size); + UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url); // It is safe to call the callback directly since this is already running // in the GCS client's lookup callback stack. - callback(object_id, node_ids, spilled_url, object_size); + callback(object_id, node_ids, spilled_url); }); } return status; diff --git a/src/ray/object_manager/object_directory.h b/src/ray/object_manager/object_directory.h index 8f06888aee23..3ce15882bfea 100644 --- a/src/ray/object_manager/object_directory.h +++ b/src/ray/object_manager/object_directory.h @@ -41,9 +41,9 @@ struct RemoteConnectionInfo { }; /// Callback for object location notifications. -using OnLocationsFound = std::function &, - const std::string &, size_t object_size)>; +using OnLocationsFound = + std::function &, const std::string &)>; class ObjectDirectoryInterface { public: @@ -185,8 +185,6 @@ class ObjectDirectory : public ObjectDirectoryInterface { std::unordered_set current_object_locations; /// The location where this object has been spilled, if any. std::string spilled_url = ""; - /// The size of the object. - size_t object_size = 0; /// This flag will get set to true if received any notification of the object. 
/// It means current_object_locations is up-to-date with GCS. It /// should never go back to false once set to true. If this is true, and diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index 467ea25675e9..d82a5fb0d069 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -73,6 +73,18 @@ ObjectManager::ObjectManager(asio::io_service &main_service, const NodeID &self_ boost::posix_time::milliseconds(config.timer_freq_ms)) { RAY_CHECK(config_.rpc_service_threads_number > 0); + const auto &object_is_local = [this](const ObjectID &object_id) { + return local_objects_.count(object_id) != 0; + }; + const auto &send_pull_request = [this](const ObjectID &object_id, + const NodeID &client_id) { + SendPullRequest(object_id, client_id); + }; + const auto &get_time = []() { return absl::GetCurrentTimeNanos() / 1e9; }; + pull_manager_.reset(new PullManager(self_node_id_, object_is_local, send_pull_request, + restore_spilled_object_, get_time, + config.pull_timeout_ms)); + push_manager_.reset(new PushManager(/* max_chunks_in_flight= */ std::max( static_cast(1L), static_cast(config_.max_bytes_in_flight / config_.object_chunk_size)))); @@ -87,40 +99,14 @@ ObjectManager::ObjectManager(asio::io_service &main_service, const NodeID &self_ main_service, config_.store_socket_name); } - const auto &object_is_local = [this](const ObjectID &object_id) { - return local_objects_.count(object_id) != 0; - }; - const auto &send_pull_request = [this](const ObjectID &object_id, - const NodeID &client_id) { - SendPullRequest(object_id, client_id); - }; - const auto &get_time = []() { return absl::GetCurrentTimeNanos() / 1e9; }; - int64_t available_memory = config.object_store_memory; - if (available_memory < 0) { - available_memory = 0; - } - pull_manager_.reset(new PullManager( - self_node_id_, object_is_local, send_pull_request, restore_spilled_object_, - get_time, config.pull_timeout_ms, 
available_memory, - [spill_objects_callback, object_store_full_callback]() { - // TODO(swang): This copies the out-of-memory handling in the - // CreateRequestQueue. It would be nice to unify these. - if (object_store_full_callback) { - object_store_full_callback(); - } - - static_cast(spill_objects_callback()); - })); - store_notification_->SubscribeObjAdded( [this](const object_manager::protocol::ObjectInfoT &object_info) { HandleObjectAdded(object_info); }); store_notification_->SubscribeObjDeleted([this](const ObjectID &oid) { + // TODO(swang): We may want to force the pull manager to fetch this object + // again, in case it was needed by an active pull request. NotifyDirectoryObjectDeleted(oid); - // Ask the pull manager to fetch this object again as soon as possible, if - // it was needed by an active pull request. - pull_manager_->ResetRetryTimer(oid); }); // Start object manager rpc server and send & receive request threads @@ -220,8 +206,8 @@ uint64_t ObjectManager::Pull(const std::vector &object_ref const auto &callback = [this](const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, size_t object_size) { - pull_manager_->OnLocationChange(object_id, client_ids, spilled_url, object_size); + const std::string &spilled_url) { + pull_manager_->OnLocationChange(object_id, client_ids, spilled_url); }; for (const auto &ref : objects_to_locate) { @@ -513,7 +499,7 @@ ray::Status ObjectManager::LookupRemainingWaitObjects(const UniqueID &wait_id) { object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &lookup_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url) { auto &wait_state = active_wait_requests_.find(wait_id)->second; // Note that the object is guaranteed to be added to local_objects_ before // the notification is triggered. 
@@ -554,7 +540,7 @@ void ObjectManager::SubscribeRemainingWaitObjects(const UniqueID &wait_id) { wait_id, object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &subscribe_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url) { auto object_id_wait_state = active_wait_requests_.find(wait_id); if (object_id_wait_state == active_wait_requests_.end()) { // Depending on the timing of calls to the object directory, we @@ -836,16 +822,6 @@ void ObjectManager::Tick(const boost::system::error_code &e) { << ". Please file a bug report on here: " "https://github.com/ray-project/ray/issues"; - // Request the current available memory from the object - // store. - if (plasma::plasma_store_runner) { - plasma::plasma_store_runner->GetAvailableMemoryAsync([this](size_t available_memory) { - main_service_->post([this, available_memory]() { - pull_manager_->UpdatePullsBasedOnAvailableMemory(available_memory); - }); - }); - } - pull_manager_->Tick(); auto interval = boost::posix_time::milliseconds(config_.timer_freq_ms); diff --git a/src/ray/object_manager/ownership_based_object_directory.cc b/src/ray/object_manager/ownership_based_object_directory.cc index efc37b3e8d8c..df11a4bb750f 100644 --- a/src/ray/object_manager/ownership_based_object_directory.cc +++ b/src/ray/object_manager/ownership_based_object_directory.cc @@ -126,10 +126,6 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( return; } - if (reply.object_size() > 0) { - it->second.object_size = reply.object_size(); - } - std::unordered_set node_ids; for (auto const &node_id : reply.node_ids()) { node_ids.emplace(NodeID::FromBinary(node_id)); @@ -145,8 +141,7 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( for (const auto &callback_pair : callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. 
- callback_pair.second(object_id, it->second.current_object_locations, "", - it->second.object_size); + callback_pair.second(object_id, it->second.current_object_locations, ""); } } @@ -213,7 +208,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. " << "LookupLocations returns an empty list of locations."; io_service_.post([callback, object_id]() { - callback(object_id, std::unordered_set(), "", 0); + callback(object_id, std::unordered_set(), ""); }); return Status::OK(); } @@ -234,7 +229,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( node_ids.emplace(NodeID::FromBinary(node_id)); } FilterRemovedNodes(gcs_client_, &node_ids); - callback(object_id, node_ids, "", reply.object_size()); + callback(object_id, node_ids, ""); }); return Status::OK(); } diff --git a/src/ray/object_manager/plasma/eviction_policy.h b/src/ray/object_manager/plasma/eviction_policy.h index d20d0b51eeb7..91788bb34ca5 100644 --- a/src/ray/object_manager/plasma/eviction_policy.h +++ b/src/ray/object_manager/plasma/eviction_policy.h @@ -196,8 +196,6 @@ class EvictionPolicy { /// Returns debugging information for this eviction policy. 
virtual std::string DebugString() const; - int64_t GetPinnedMemoryBytes() const { return pinned_memory_bytes_; } - protected: /// Returns the size of the object int64_t GetObjectSize(const ObjectID &object_id) const; diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index 2ad3aad261c7..ec338d388514 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -33,7 +33,6 @@ #include "ray/object_manager/plasma/connection.h" #include "ray/object_manager/plasma/create_request_queue.h" #include "ray/object_manager/plasma/plasma.h" -#include "ray/object_manager/plasma/plasma_allocator.h" #include "ray/object_manager/plasma/protocol.h" #include "ray/object_manager/plasma/quota_aware_policy.h" @@ -210,12 +209,6 @@ class PlasmaStore { /// Process queued requests to create an object. void ProcessCreateRequests(); - void GetAvailableMemory(std::function callback) const { - size_t available = - PlasmaAllocator::GetFootprintLimit() - eviction_policy_.GetPinnedMemoryBytes(); - callback(available); - } - private: PlasmaError HandleCreateObjectRequest(const std::shared_ptr &client, const std::vector &message, diff --git a/src/ray/object_manager/plasma/store_runner.h b/src/ray/object_manager/plasma/store_runner.h index 7ac7be59bbc5..3edd70350cc2 100644 --- a/src/ray/object_manager/plasma/store_runner.h +++ b/src/ray/object_manager/plasma/store_runner.h @@ -1,8 +1,9 @@ #pragma once -#include #include +#include + #include "absl/synchronization/mutex.h" #include "ray/object_manager/notification/object_store_notification_manager.h" #include "ray/object_manager/plasma/store.h" @@ -22,10 +23,6 @@ class PlasmaStoreRunner { } bool IsPlasmaObjectSpillable(const ObjectID &object_id); - void GetAvailableMemoryAsync(std::function callback) const { - main_service_.post([this, callback]() { store_->GetAvailableMemory(callback); }); - } - private: void Shutdown(); absl::Mutex store_runner_mutex_; @@ -33,7 +30,7 @@ class 
PlasmaStoreRunner { int64_t system_memory_; bool hugepages_enabled_; std::string plasma_directory_; - mutable boost::asio::io_service main_service_; + boost::asio::io_service main_service_; std::unique_ptr store_; std::shared_ptr listener_; }; diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index 1ebf9214a707..289ad13eb5cc 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -8,16 +8,13 @@ PullManager::PullManager( NodeID &self_node_id, const std::function object_is_local, const std::function send_pull_request, const RestoreSpilledObjectCallback restore_spilled_object, - const std::function get_time, int pull_timeout_ms, - size_t num_bytes_available, std::function object_store_full_callback) + const std::function get_time, int pull_timeout_ms) : self_node_id_(self_node_id), object_is_local_(object_is_local), send_pull_request_(send_pull_request), restore_spilled_object_(restore_spilled_object), get_time_(get_time), pull_timeout_ms_(pull_timeout_ms), - num_bytes_available_(num_bytes_available), - object_store_full_callback_(object_store_full_callback), gen_(std::chrono::high_resolution_clock::now().time_since_epoch().count()) {} uint64_t PullManager::Pull(const std::vector &object_ref_bundle, @@ -42,224 +39,33 @@ uint64_t PullManager::Pull(const std::vector &object_ref_b it->second.bundle_request_ids.insert(bundle_it->first); } - // We have a new request. Activate the new request, if the - // current available memory allows it. - UpdatePullsBasedOnAvailableMemory(num_bytes_available_); - return bundle_it->first; } -bool PullManager::ActivateNextPullBundleRequest( - const std::map>::iterator - &next_request_it) { - // Check that we have sizes for all of the objects in the bundle. If not, we - // should not activate the bundle, since it may put us over the available - // capacity. 
- for (const auto &ref : next_request_it->second) { - auto obj_id = ObjectRefToId(ref); - const auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - if (!it->second.object_size_set) { - // NOTE(swang): The size could be 0 if we haven't received size - // information yet. If we receive the size later on, we will update the - // total bytes being pulled then. - RAY_LOG(DEBUG) << "No size for " << obj_id << ", canceling activation for pull " - << next_request_it->first; - return false; - } - } - - // Activate the bundle. - for (const auto &ref : next_request_it->second) { - auto obj_id = ObjectRefToId(ref); - bool start_pull = active_object_pull_requests_.count(obj_id) == 0; - active_object_pull_requests_[obj_id].insert(next_request_it->first); - if (start_pull) { - RAY_LOG(DEBUG) << "Activating pull for object " << obj_id; - // This is the first bundle request in the queue to require this object. - // Add the size to the number of bytes being pulled. - auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - num_bytes_being_pulled_ += it->second.object_size; - } - } - - // Update the pointer to the last pull request that we are actively pulling. 
- RAY_CHECK(next_request_it->first > highest_req_id_being_pulled_); - highest_req_id_being_pulled_ = next_request_it->first; - return true; -} - -void PullManager::DeactivatePullBundleRequest( - const std::map>::iterator &request_it) { - for (const auto &ref : request_it->second) { - auto obj_id = ObjectRefToId(ref); - RAY_CHECK(active_object_pull_requests_[obj_id].erase(request_it->first)); - if (active_object_pull_requests_[obj_id].empty()) { - RAY_LOG(DEBUG) << "Deactivating pull for object " << obj_id; - auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - num_bytes_being_pulled_ -= it->second.object_size; - active_object_pull_requests_.erase(obj_id); - } - } - - // If this was the last active request, update the pointer to its - // predecessor, if one exists. - if (highest_req_id_being_pulled_ == request_it->first) { - if (request_it == pull_request_bundles_.begin()) { - highest_req_id_being_pulled_ = 0; - } else { - highest_req_id_being_pulled_ = std::prev(request_it)->first; - } - } -} - -void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) { - if (num_bytes_available_ != num_bytes_available) { - RAY_LOG(DEBUG) << "Updating pulls based on available memory: " << num_bytes_available; - } - num_bytes_available_ = num_bytes_available; - uint64_t prev_highest_req_id_being_pulled = highest_req_id_being_pulled_; - - std::unordered_set object_ids_to_pull; - // While there is available capacity, activate the next pull request. - while (num_bytes_being_pulled_ < num_bytes_available_) { - // Get the next pull request in the queue. - const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); - auto next_request_it = last_request_it; - if (next_request_it == pull_request_bundles_.end()) { - // No requests are active. Get the first request in the queue. 
- next_request_it = pull_request_bundles_.begin(); - } else { - next_request_it++; - } - - if (next_request_it == pull_request_bundles_.end()) { - // No requests in the queue. - break; - } - - RAY_LOG(DEBUG) << "Activating request " << next_request_it->first - << " num bytes being pulled: " << num_bytes_being_pulled_ - << " num bytes available: " << num_bytes_available_; - // There is another pull bundle request that we could try, and there is - // enough space. Activate the next pull bundle request in the queue. - if (!ActivateNextPullBundleRequest(next_request_it)) { - // This pull bundle request could not be activated, due to lack of object - // size information. Wait until we have object size information before - // activating this pull bundle. - break; - } - } - - std::unordered_set object_ids_to_cancel; - // While the total bytes requested is over the available capacity, deactivate - // the last pull request, ordered by request ID. - while (num_bytes_being_pulled_ > num_bytes_available_) { - RAY_LOG(DEBUG) << "Deactivating request " << highest_req_id_being_pulled_ - << " num bytes being pulled: " << num_bytes_being_pulled_ - << " num bytes available: " << num_bytes_available_; - const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); - RAY_CHECK(last_request_it != pull_request_bundles_.end()); - DeactivatePullBundleRequest(last_request_it); - } - - TriggerOutOfMemoryHandlingIfNeeded(); - - if (highest_req_id_being_pulled_ > prev_highest_req_id_being_pulled) { - // There are newly activated requests. Start pulling objects for the newly - // activated requests. - // NOTE(swang): We could also just wait for the next timer tick to pull the - // objects, but this would add a delay of up to one tick for any bundles of - // multiple objects, even when we are not under memory pressure. - Tick(); - } -} - -void PullManager::TriggerOutOfMemoryHandlingIfNeeded() { - if (pull_request_bundles_.empty()) { - // No requests queued. 
- return; - } - - const auto head = pull_request_bundles_.begin(); - if (highest_req_id_being_pulled_ >= head->first) { - // At least one request is being actively pulled, so there is currently - // enough space. - return; - } - - // No requests are being pulled. Check whether this is because we don't have - // object size information yet. - size_t num_bytes_needed = 0; - for (const auto &ref : head->second) { - auto obj_id = ObjectRefToId(ref); - const auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - if (!it->second.object_size_set) { - // We're not pulling the first request because we don't have size - // information. Wait for the size information before triggering OOM - return; - } - num_bytes_needed += it->second.object_size; - } - - // The first request in the queue is not being pulled due to lack of space. - // Trigger out-of-memory handling to try to make room. - // TODO(swang): This can hang if no room can be made. We should return an - // error for requests whose total size is larger than the capacity of the - // memory store. - if (get_time_() - last_oom_reported_ms_ > 30000) { - RAY_LOG(WARNING) - << "There is not enough memory to pull objects needed by a queued task or " - "a worker blocked in ray.get or ray.wait. " - << "Need " << num_bytes_needed << " bytes, but only " << num_bytes_available_ - << " bytes are available on this node. " - << "This job may hang if no memory can be freed through garbage collection or " - "object spilling. See " - "https://docs.ray.io/en/master/memory-management.html for more information. 
" - "Please file a GitHub issue if you see this message repeatedly."; - last_oom_reported_ms_ = get_time_(); - } - object_store_full_callback_(); -} - std::vector PullManager::CancelPull(uint64_t request_id) { + std::vector objects_to_cancel; RAY_LOG(DEBUG) << "Cancel pull request " << request_id; auto bundle_it = pull_request_bundles_.find(request_id); RAY_CHECK(bundle_it != pull_request_bundles_.end()); - // If the pull request was being actively pulled, deactivate it now. - if (bundle_it->first <= highest_req_id_being_pulled_) { - DeactivatePullBundleRequest(bundle_it); - } - - // Erase this pull request. - std::vector object_ids_to_cancel; for (const auto &ref : bundle_it->second) { auto obj_id = ObjectRefToId(ref); auto it = object_pull_requests_.find(obj_id); RAY_CHECK(it != object_pull_requests_.end()); - RAY_CHECK(it->second.bundle_request_ids.erase(bundle_it->first)); + RAY_CHECK(it->second.bundle_request_ids.erase(request_id)); if (it->second.bundle_request_ids.empty()) { object_pull_requests_.erase(it); - object_ids_to_cancel.push_back(obj_id); + objects_to_cancel.push_back(obj_id); } } - pull_request_bundles_.erase(bundle_it); - - // We need to update the pulls in case there is another request(s) after this - // request that can now be activated. We do this after erasing the cancelled - // request to avoid reactivating it again. - UpdatePullsBasedOnAvailableMemory(num_bytes_available_); - return object_ids_to_cancel; + pull_request_bundles_.erase(bundle_it); + return objects_to_cancel; } void PullManager::OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url) { // Exit if the Pull request has already been fulfilled or canceled. auto it = object_pull_requests_.find(object_id); if (it == object_pull_requests_.end()) { @@ -271,14 +77,6 @@ void PullManager::OnLocationChange(const ObjectID &object_id, // before. 
it->second.client_locations = std::vector(client_ids.begin(), client_ids.end()); it->second.spilled_url = spilled_url; - - if (!it->second.object_size_set) { - RAY_LOG(DEBUG) << "Updated size of object " << object_id << " to " << object_size - << ", num bytes being pulled is now " << num_bytes_being_pulled_; - it->second.object_size = object_size; - it->second.object_size_set = true; - UpdatePullsBasedOnAvailableMemory(num_bytes_available_); - } RAY_LOG(DEBUG) << "OnLocationChange " << spilled_url << " num clients " << client_ids.size(); @@ -289,11 +87,10 @@ void PullManager::TryToMakeObjectLocal(const ObjectID &object_id) { if (object_is_local_(object_id)) { return; } - if (active_object_pull_requests_.count(object_id) == 0) { + auto it = object_pull_requests_.find(object_id); + if (it == object_pull_requests_.end()) { return; } - auto it = object_pull_requests_.find(object_id); - RAY_CHECK(it != object_pull_requests_.end()); auto &request = it->second; if (request.next_pull_time > get_time_()) { return; @@ -377,14 +174,6 @@ bool PullManager::PullFromRandomLocation(const ObjectID &object_id) { return true; } -void PullManager::ResetRetryTimer(const ObjectID &object_id) { - auto it = object_pull_requests_.find(object_id); - if (it != object_pull_requests_.end()) { - it->second.next_pull_time = get_time_(); - it->second.num_retries = 0; - } -} - void PullManager::UpdateRetryTimer(ObjectPullRequest &request) { const auto time = get_time_(); auto retry_timeout_len = (pull_timeout_ms_ / 1000.) 
* (1UL << request.num_retries); @@ -395,7 +184,7 @@ void PullManager::UpdateRetryTimer(ObjectPullRequest &request) { } void PullManager::Tick() { - for (auto &pair : active_object_pull_requests_) { + for (auto &pair : object_pull_requests_) { const auto &object_id = pair.first; TryToMakeObjectLocal(object_id); } diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index e4a662eb6306..6364ae34a68d 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -40,14 +40,9 @@ class PullManager { NodeID &self_node_id, const std::function object_is_local, const std::function send_pull_request, const RestoreSpilledObjectCallback restore_spilled_object, - const std::function get_time, int pull_timeout_ms, - size_t num_bytes_available, std::function object_store_full_callback); - - /// Add a new pull request for a bundle of objects. The objects in the - /// request will get pulled once: - /// 1. Their sizes are known. - /// 2. Their total size, together with the total size of all requests - /// preceding this one, is within the capacity of the local object store. + const std::function get_time, int pull_timeout_ms); + + /// Begin a new pull request for a bundle of objects. /// /// \param object_refs The bundle of objects that must be made local. /// \param objects_to_locate The objects whose new locations the caller @@ -56,15 +51,6 @@ class PullManager { uint64_t Pull(const std::vector &object_ref_bundle, std::vector *objects_to_locate); - /// Update the pull requests that are currently being pulled, according to - /// the current capacity. The PullManager will choose the objects to pull by - /// taking the longest contiguous prefix of the request queue whose total - /// size is less than the given capacity. - /// - /// \param num_bytes_available The number of bytes that are currently - /// available to store objects pulled from another node. 
- void UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available); - /// Called when the available locations for a given object change. /// /// \param object_id The ID of the object which is now available in a new location. @@ -74,7 +60,7 @@ class PullManager { /// non-empty, the object may no longer be on any node. void OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, size_t object_size); + const std::string &spilled_url); /// Cancel an existing pull request. /// @@ -87,13 +73,6 @@ class PullManager { /// existing objects from other nodes if necessary. void Tick(); - /// Call to reset the retry timer for an object that is actively being - /// pulled. This should be called for objects that were evicted but that may - /// still be needed on this node. - /// - /// \param object_id The object ID to reset. - void ResetRetryTimer(const ObjectID &object_id); - /// The number of ongoing object pulls. int NumActiveRequests() const; @@ -110,11 +89,6 @@ class PullManager { std::string spilled_url; double next_pull_time; uint8_t num_retries; - bool object_size_set = false; - size_t object_size = 0; - // All bundle requests that haven't been canceled yet that require this - // object. This includes bundle requests whose objects are not actively - // being pulled. absl::flat_hash_set bundle_request_ids; }; @@ -138,22 +112,6 @@ class PullManager { /// \param request The request to update the retry time of. void UpdateRetryTimer(ObjectPullRequest &request); - /// Activate the next pull request in the queue. This will start pulls for - /// any objects in the request that are not already being pulled. - bool ActivateNextPullBundleRequest( - const std::map>::iterator - &next_request_it); - - /// Deactivate a pull request in the queue. This cancels any pull or restore - /// operations for the object. 
- void DeactivatePullBundleRequest( - const std::map>::iterator &request_it); - - /// Trigger out-of-memory handling if the first request in the queue needs - /// more space than the bytes available. This is needed to make room for the - /// request. - void TriggerOutOfMemoryHandlingIfNeeded(); - /// See the constructor's arguments. NodeID self_node_id_; const std::function object_is_local_; @@ -166,51 +124,13 @@ class PullManager { /// cancel. Start at 1 because 0 means null. uint64_t next_req_id_ = 1; - /// The currently active pull requests. Each request is a bundle of objects - /// that must be made local. The key is the ID that was assigned to that - /// request, which can be used by the caller to cancel the request. - std::map> pull_request_bundles_; - - /// The total number of bytes that we are currently pulling. This is the - /// total size of the objects requested that we are actively pulling. To - /// avoid starvation, this is always less than the available capacity in the - /// local object store. - size_t num_bytes_being_pulled_ = 0; - - /// The total number of bytes that is available to store objects that we are - /// pulling. - size_t num_bytes_available_; - - /// Triggered when the first request in the queue can't be pulled due to - /// out-of-memory. This callback should try to make more bytes available. - std::function object_store_full_callback_; - - /// The last time OOM was reported. Track this so we don't spam warnings when - /// the object store is full. - uint64_t last_oom_reported_ms_ = 0; - - /// A pointer to the highest request ID whose objects we are currently - /// pulling. We always pull a contiguous prefix of the active pull requests. - /// This means that all requests with a lower ID are either already canceled - /// or their objects are also being pulled. - uint64_t highest_req_id_being_pulled_ = 0; - - /// The objects that this object manager has been asked to fetch from remote - /// object managers. 
- std::unordered_map object_pull_requests_; + std::unordered_map> pull_request_bundles_; - /// The objects that we are currently fetching. This is a subset of the - /// objects that we have been asked to fetch. The total size of these objects - /// is the number of bytes that we are currently pulling, and it must be less - /// than the bytes available. - absl::flat_hash_map> - active_object_pull_requests_; + /// The objects that this object manager is currently trying to fetch from + /// remote object managers. + std::unordered_map object_pull_requests_; /// Internally maintained random number generator. std::mt19937_64 gen_; - - friend class PullManagerTest; - friend class PullManagerTestWithCapacity; - friend class PullManagerWithAdmissionControlTest; }; } // namespace ray diff --git a/src/ray/object_manager/test/object_manager_stress_test.cc b/src/ray/object_manager/test/object_manager_stress_test.cc new file mode 100644 index 000000000000..8896ba9968db --- /dev/null +++ b/src/ray/object_manager/test/object_manager_stress_test.cc @@ -0,0 +1,453 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/common_protocol.h" +#include "ray/common/status.h" +#include "ray/common/test_util.h" +#include "ray/gcs/gcs_client/service_based_gcs_client.h" +#include "ray/object_manager/object_manager.h" +#include "ray/util/filesystem.h" +#include "src/ray/protobuf/common.pb.h" + +extern "C" { +#include "hiredis/hiredis.h" +} + +namespace ray { + +using rpc::GcsNodeInfo; + +static inline bool flushall_redis(void) { + redisContext *context = redisConnect("127.0.0.1", 6379); + if (context == nullptr || context->err) { + return false; + } + freeReplyObject(redisCommand(context, "FLUSHALL")); + freeReplyObject(redisCommand(context, "SET NumRedisShards 1")); + freeReplyObject(redisCommand(context, "LPUSH RedisShards 127.0.0.1:6380")); + redisFree(context); + + redisContext *shard_context = redisConnect("127.0.0.1", 6380); + if (shard_context == nullptr || shard_context->err) { + return false; + } + freeReplyObject(redisCommand(shard_context, "FLUSHALL")); + redisFree(shard_context); + + return true; +} + +int64_t current_time_ms() { + std::chrono::milliseconds ms_since_epoch = + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()); + return ms_since_epoch.count(); +} + +class MockServer { + public: + MockServer(boost::asio::io_service &main_service, + const ObjectManagerConfig &object_manager_config, + std::shared_ptr gcs_client) + : node_id_(NodeID::FromRandom()), + config_(object_manager_config), + gcs_client_(gcs_client), + object_manager_(main_service, node_id_, object_manager_config, + std::make_shared(main_service, gcs_client_), + nullptr) { + RAY_CHECK_OK(RegisterGcs(main_service)); + } + + ~MockServer() { RAY_CHECK_OK(gcs_client_->Nodes().UnregisterSelf()); } + + private: + ray::Status RegisterGcs(boost::asio::io_service &io_service) { + auto object_manager_port = object_manager_.GetServerPort(); + GcsNodeInfo node_info; + 
node_info.set_node_id(node_id_.Binary()); + node_info.set_node_manager_address("127.0.0.1"); + node_info.set_node_manager_port(object_manager_port); + node_info.set_object_manager_port(object_manager_port); + + ray::Status status = gcs_client_->Nodes().RegisterSelf(node_info, nullptr); + std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + return status; + } + + friend class StressTestObjectManager; + + NodeID node_id_; + ObjectManagerConfig config_; + std::shared_ptr gcs_client_; + ObjectManager object_manager_; +}; + +class TestObjectManagerBase : public ::testing::Test { + public: + void SetUp() { + WaitForCondition(flushall_redis, 7000); + + // start store + socket_name_1 = TestSetupUtil::StartObjectStore(); + socket_name_2 = TestSetupUtil::StartObjectStore(); + + unsigned int pull_timeout_ms = 1000; + uint64_t object_chunk_size = static_cast(std::pow(10, 3)); + int push_timeout_ms = 10000; + + // start first server + gcs_server_socket_name_ = TestSetupUtil::StartGcsServer("127.0.0.1"); + gcs::GcsClientOptions client_options("127.0.0.1", 6379, /*password*/ "", + /*is_test_client=*/false); + gcs_client_1 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_1->Connect(main_service)); + ObjectManagerConfig om_config_1; + om_config_1.store_socket_name = socket_name_1; + om_config_1.pull_timeout_ms = pull_timeout_ms; + om_config_1.object_chunk_size = object_chunk_size; + om_config_1.push_timeout_ms = push_timeout_ms; + om_config_1.object_manager_port = 0; + om_config_1.rpc_service_threads_number = 3; + server1.reset(new MockServer(main_service, om_config_1, gcs_client_1)); + + // start second server + gcs_client_2 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_2->Connect(main_service)); + ObjectManagerConfig om_config_2; + om_config_2.store_socket_name = socket_name_2; + om_config_2.pull_timeout_ms = pull_timeout_ms; + om_config_2.object_chunk_size = object_chunk_size; + om_config_2.push_timeout_ms = push_timeout_ms; + 
om_config_2.object_manager_port = 0; + om_config_2.rpc_service_threads_number = 3; + server2.reset(new MockServer(main_service, om_config_2, gcs_client_2)); + + // connect to stores. + RAY_CHECK_OK(client1.Connect(socket_name_1)); + RAY_CHECK_OK(client2.Connect(socket_name_2)); + } + + void TearDown() { + Status client1_status = client1.Disconnect(); + Status client2_status = client2.Disconnect(); + ASSERT_TRUE(client1_status.ok() && client2_status.ok()); + + gcs_client_1->Disconnect(); + gcs_client_2->Disconnect(); + + this->server1.reset(); + this->server2.reset(); + + TestSetupUtil::StopObjectStore(socket_name_1); + TestSetupUtil::StopObjectStore(socket_name_2); + + if (!gcs_server_socket_name_.empty()) { + TestSetupUtil::StopGcsServer(gcs_server_socket_name_); + } + } + + ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size) { + ObjectID object_id = ObjectID::FromRandom(); + RAY_LOG(DEBUG) << "ObjectID Created: " << object_id; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + uint64_t retry_with_request_id = 0; + std::shared_ptr data; + RAY_CHECK_OK(client.Create(object_id, ray::rpc::Address(), data_size, metadata, + metadata_size, &retry_with_request_id, &data)); + RAY_CHECK(retry_with_request_id == 0); + RAY_CHECK_OK(client.Seal(object_id)); + return object_id; + } + + void object_added_handler_1(ObjectID object_id) { v1.push_back(object_id); }; + + void object_added_handler_2(ObjectID object_id) { v2.push_back(object_id); }; + + protected: + std::thread p; + boost::asio::io_service main_service; + std::shared_ptr gcs_client_1; + std::shared_ptr gcs_client_2; + std::unique_ptr server1; + std::unique_ptr server2; + + plasma::PlasmaClient client1; + plasma::PlasmaClient client2; + std::vector v1; + std::vector v2; + + std::string gcs_server_socket_name_; + std::string socket_name_1; + std::string socket_name_2; +}; + +class StressTestObjectManager : public TestObjectManagerBase { + public: + enum class 
TransferPattern { + PUSH_A_B, + PUSH_B_A, + BIDIRECTIONAL_PUSH, + PULL_A_B, + PULL_B_A, + BIDIRECTIONAL_PULL, + BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE, + }; + + int async_loop_index = -1; + size_t num_expected_objects; + + std::vector async_loop_patterns = { + TransferPattern::PUSH_A_B, + TransferPattern::PUSH_B_A, + TransferPattern::BIDIRECTIONAL_PUSH, + TransferPattern::PULL_A_B, + TransferPattern::PULL_B_A, + TransferPattern::BIDIRECTIONAL_PULL, + TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE}; + + int num_connected_clients = 0; + + NodeID node_id_1; + NodeID node_id_2; + + int64_t start_time; + + void WaitConnections() { + node_id_1 = gcs_client_1->Nodes().GetSelfId(); + node_id_2 = gcs_client_2->Nodes().GetSelfId(); + RAY_CHECK_OK(gcs_client_1->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients += 1; + } + if (num_connected_clients == 4) { + StartTests(); + } + }, + nullptr)); + RAY_CHECK_OK(gcs_client_2->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients += 1; + } + if (num_connected_clients == 4) { + StartTests(); + } + }, + nullptr)); + } + + void StartTests() { + TestConnections(); + AddTransferTestHandlers(); + TransferTestNext(); + } + + void AddTransferTestHandlers() { + ray::Status status = ray::Status::OK(); + status = server1->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + object_added_handler_1(ObjectID::FromBinary(object_info.object_id)); + if (v1.size() == num_expected_objects && v1.size() == v2.size()) { + TransferTestComplete(); + } + }); + RAY_CHECK_OK(status); + status = server2->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + 
object_added_handler_2(ObjectID::FromBinary(object_info.object_id)); + if (v2.size() == num_expected_objects && v1.size() == v2.size()) { + TransferTestComplete(); + } + }); + RAY_CHECK_OK(status); + } + + void TransferTestNext() { + async_loop_index += 1; + if ((size_t)async_loop_index < async_loop_patterns.size()) { + TransferPattern pattern = async_loop_patterns[async_loop_index]; + TransferTestExecute(100, 3 * std::pow(10, 3) - 1, pattern); + } else { + main_service.stop(); + } + } + + plasma::ObjectBuffer GetObject(plasma::PlasmaClient &client, ObjectID &object_id) { + plasma::ObjectBuffer object_buffer; + RAY_CHECK_OK(client.Get(&object_id, 1, 0, &object_buffer)); + return object_buffer; + } + + void CompareObjects(ObjectID &object_id_1, ObjectID &object_id_2) { + plasma::ObjectBuffer object_buffer_1 = GetObject(client1, object_id_1); + plasma::ObjectBuffer object_buffer_2 = GetObject(client2, object_id_2); + uint8_t *data_1 = const_cast(object_buffer_1.data->Data()); + uint8_t *data_2 = const_cast(object_buffer_2.data->Data()); + ASSERT_EQ(object_buffer_1.data->Size(), object_buffer_2.data->Size()); + ASSERT_EQ(object_buffer_1.metadata->Size(), object_buffer_2.metadata->Size()); + int64_t total_size = object_buffer_1.data->Size() + object_buffer_1.metadata->Size(); + RAY_LOG(DEBUG) << "total_size " << total_size; + for (int i = -1; ++i < total_size;) { + ASSERT_TRUE(data_1[i] == data_2[i]); + } + } + + void TransferTestComplete() { + int64_t elapsed = current_time_ms() - start_time; + RAY_LOG(INFO) << "TransferTestComplete: " + << static_cast(async_loop_patterns[async_loop_index]) << " " + << v1.size() << " " << elapsed; + ASSERT_TRUE(v1.size() == v2.size()); + for (size_t i = 0; i < v1.size(); ++i) { + ASSERT_TRUE(std::find(v1.begin(), v1.end(), v2[i]) != v1.end()); + } + + // Compare objects and their hashes. 
+ for (size_t i = 0; i < v1.size(); ++i) { + ObjectID object_id_2 = v2[i]; + ObjectID object_id_1 = + v1[std::distance(v1.begin(), std::find(v1.begin(), v1.end(), v2[i]))]; + CompareObjects(object_id_1, object_id_2); + } + + v1.clear(); + v2.clear(); + TransferTestNext(); + } + + void TransferTestExecute(int num_trials, int64_t data_size, + TransferPattern transfer_pattern) { + NodeID node_id_1 = gcs_client_1->Nodes().GetSelfId(); + NodeID node_id_2 = gcs_client_2->Nodes().GetSelfId(); + + if (transfer_pattern == TransferPattern::BIDIRECTIONAL_PULL || + transfer_pattern == TransferPattern::BIDIRECTIONAL_PUSH || + transfer_pattern == TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE) { + num_expected_objects = (size_t)2 * num_trials; + } else { + num_expected_objects = (size_t)num_trials; + } + + start_time = current_time_ms(); + + switch (transfer_pattern) { + case TransferPattern::PUSH_A_B: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + server1->object_manager_.Push(oid1, node_id_2); + } + } break; + case TransferPattern::PUSH_B_A: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid2 = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(oid2, node_id_1); + } + } break; + case TransferPattern::BIDIRECTIONAL_PUSH: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + server1->object_manager_.Push(oid1, node_id_2); + ObjectID oid2 = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(oid2, node_id_1); + } + } break; + case TransferPattern::PULL_A_B: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + static_cast( + server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); + } + } break; + case TransferPattern::PULL_B_A: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid2 = WriteDataToClient(client2, data_size); + static_cast( + 
server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); + } + } break; + case TransferPattern::BIDIRECTIONAL_PULL: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + static_cast( + server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); + ObjectID oid2 = WriteDataToClient(client2, data_size); + static_cast( + server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); + } + } break; + case TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE: { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(1, 50); + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size + dis(gen)); + static_cast( + server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); + ObjectID oid2 = WriteDataToClient(client2, data_size + dis(gen)); + static_cast( + server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); + } + } break; + default: { + RAY_LOG(FATAL) << "No case for transfer_pattern " + << static_cast(transfer_pattern); + } break; + } + } + + void TestConnections() { + RAY_LOG(DEBUG) << "\n" + << "Server node ids:" + << "\n"; + NodeID node_id_1 = gcs_client_1->Nodes().GetSelfId(); + NodeID node_id_2 = gcs_client_2->Nodes().GetSelfId(); + RAY_LOG(DEBUG) << "Server 1: " << node_id_1 << "\n" + << "Server 2: " << node_id_2; + + RAY_LOG(DEBUG) << "\n" + << "All connected nodes:" + << "\n"; + auto data = gcs_client_1->Nodes().Get(node_id_1); + RAY_LOG(DEBUG) << "NodeID=" << NodeID::FromBinary(data->node_id()) << "\n" + << "NodeIp=" << data->node_manager_address() << "\n" + << "NodePort=" << data->node_manager_port(); + auto data2 = gcs_client_1->Nodes().Get(node_id_2); + RAY_LOG(DEBUG) << "NodeID=" << NodeID::FromBinary(data2->node_id()) << "\n" + << "NodeIp=" << data2->node_manager_address() << "\n" + << "NodePort=" << data2->node_manager_port(); + } +}; + +TEST_F(StressTestObjectManager, 
StartStressTestObjectManager) { + auto AsyncStartTests = main_service.wrap([this]() { WaitConnections(); }); + AsyncStartTests(); + main_service.run(); +} + +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ray::TEST_STORE_EXEC_PATH = std::string(argv[1]); + ray::TEST_GCS_SERVER_EXEC_PATH = std::string(argv[2]); + return RUN_ALL_TESTS(); +} diff --git a/src/ray/object_manager/test/object_manager_test.cc b/src/ray/object_manager/test/object_manager_test.cc new file mode 100644 index 000000000000..7afe2e42ef03 --- /dev/null +++ b/src/ray/object_manager/test/object_manager_test.cc @@ -0,0 +1,496 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/object_manager/object_manager.h" + +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/status.h" +#include "ray/common/test_util.h" +#include "ray/gcs/gcs_client/service_based_gcs_client.h" +#include "ray/util/filesystem.h" +#include "src/ray/protobuf/common.pb.h" + +extern "C" { +#include "hiredis/hiredis.h" +} + +namespace { +int64_t wait_timeout_ms; +} // namespace + +namespace ray { + +using rpc::GcsNodeInfo; + +static inline void flushall_redis(void) { + redisContext *context = redisConnect("127.0.0.1", 6379); + freeReplyObject(redisCommand(context, "FLUSHALL")); + freeReplyObject(redisCommand(context, "SET NumRedisShards 1")); + freeReplyObject(redisCommand(context, "LPUSH RedisShards 127.0.0.1:6380")); + redisFree(context); +} + +class MockServer { + public: + MockServer(boost::asio::io_service &main_service, + const ObjectManagerConfig &object_manager_config, + std::shared_ptr gcs_client) + : node_id_(NodeID::FromRandom()), + config_(object_manager_config), + gcs_client_(gcs_client), + object_manager_(main_service, node_id_, object_manager_config, + std::make_shared(main_service, gcs_client_), + nullptr) { + RAY_CHECK_OK(RegisterGcs(main_service)); + } + + ~MockServer() { RAY_CHECK_OK(gcs_client_->Nodes().UnregisterSelf()); } + + private: + ray::Status RegisterGcs(boost::asio::io_service &io_service) { + auto object_manager_port = object_manager_.GetServerPort(); + GcsNodeInfo node_info; + node_info.set_node_id(node_id_.Binary()); + node_info.set_node_manager_address("127.0.0.1"); + node_info.set_node_manager_port(object_manager_port); + node_info.set_object_manager_port(object_manager_port); + + ray::Status status = gcs_client_->Nodes().RegisterSelf(node_info, nullptr); + return status; + } + + friend class TestObjectManager; + + NodeID node_id_; + ObjectManagerConfig config_; + std::shared_ptr gcs_client_; + ObjectManager object_manager_; +}; + +class TestObjectManagerBase : public ::testing::Test { + public: + void 
SetUp() { + flushall_redis(); + + // start store + socket_name_1 = TestSetupUtil::StartObjectStore(); + socket_name_2 = TestSetupUtil::StartObjectStore(); + + unsigned int pull_timeout_ms = 1; + push_timeout_ms = 1500; + + // start first server + gcs_server_socket_name_ = TestSetupUtil::StartGcsServer("127.0.0.1"); + gcs::GcsClientOptions client_options("127.0.0.1", 6379, /*password*/ "", + /*is_test_client=*/true); + gcs_client_1 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_1->Connect(main_service)); + ObjectManagerConfig om_config_1; + om_config_1.store_socket_name = socket_name_1; + om_config_1.pull_timeout_ms = pull_timeout_ms; + om_config_1.object_chunk_size = object_chunk_size; + om_config_1.push_timeout_ms = push_timeout_ms; + om_config_1.object_manager_port = 0; + om_config_1.rpc_service_threads_number = 3; + server1.reset(new MockServer(main_service, om_config_1, gcs_client_1)); + + // start second server + gcs_client_2 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_2->Connect(main_service)); + ObjectManagerConfig om_config_2; + om_config_2.store_socket_name = socket_name_2; + om_config_2.pull_timeout_ms = pull_timeout_ms; + om_config_2.object_chunk_size = object_chunk_size; + om_config_2.push_timeout_ms = push_timeout_ms; + om_config_2.object_manager_port = 0; + om_config_2.rpc_service_threads_number = 3; + server2.reset(new MockServer(main_service, om_config_2, gcs_client_2)); + + // connect to stores. 
+ RAY_CHECK_OK(client1.Connect(socket_name_1)); + RAY_CHECK_OK(client2.Connect(socket_name_2)); + } + + void TearDown() { + Status client1_status = client1.Disconnect(); + Status client2_status = client2.Disconnect(); + ASSERT_TRUE(client1_status.ok() && client2_status.ok()); + + gcs_client_1->Disconnect(); + gcs_client_2->Disconnect(); + + this->server1.reset(); + this->server2.reset(); + + TestSetupUtil::StopObjectStore(socket_name_1); + TestSetupUtil::StopObjectStore(socket_name_2); + + if (!gcs_server_socket_name_.empty()) { + TestSetupUtil::StopGcsServer(gcs_server_socket_name_); + } + } + + ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size) { + return WriteDataToClient(client, data_size, ObjectID::FromRandom()); + } + + ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size, + ObjectID object_id) { + RAY_LOG(DEBUG) << "ObjectID Created: " << object_id; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + uint64_t retry_with_request_id = 0; + std::shared_ptr data; + RAY_CHECK_OK(client.Create(object_id, ray::rpc::Address(), data_size, metadata, + metadata_size, &retry_with_request_id, &data)); + RAY_CHECK(retry_with_request_id == 0); + RAY_CHECK_OK(client.Seal(object_id)); + return object_id; + } + + void object_added_handler_1(ObjectID object_id) { v1.push_back(object_id); }; + + void object_added_handler_2(ObjectID object_id) { v2.push_back(object_id); }; + + protected: + std::thread p; + boost::asio::io_service main_service; + std::shared_ptr gcs_client_1; + std::shared_ptr gcs_client_2; + std::unique_ptr server1; + std::unique_ptr server2; + + plasma::PlasmaClient client1; + plasma::PlasmaClient client2; + std::vector v1; + std::vector v2; + + std::string gcs_server_socket_name_; + std::string socket_name_1; + std::string socket_name_2; + + unsigned int push_timeout_ms; + + uint64_t object_chunk_size = static_cast(std::pow(10, 3)); +}; + +class TestObjectManager : public 
TestObjectManagerBase { + public: + int current_wait_test = -1; + int num_connected_clients_1 = 0; + int num_connected_clients_2 = 0; + std::atomic ready_cnt; + NodeID node_id_1; + NodeID node_id_2; + + ObjectID created_object_id1; + ObjectID created_object_id2; + + std::unique_ptr timer; + + void WaitConnections() { + node_id_1 = gcs_client_1->Nodes().GetSelfId(); + node_id_2 = gcs_client_2->Nodes().GetSelfId(); + RAY_CHECK_OK(gcs_client_1->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients_1 += 1; + } + if (num_connected_clients_1 == 2) { + ready_cnt += 1; + if (ready_cnt == 2) { + StartTests(); + } + } + }, + nullptr)); + RAY_CHECK_OK(gcs_client_2->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients_2 += 1; + } + if (num_connected_clients_2 == 2) { + ready_cnt += 1; + if (ready_cnt == 2) { + StartTests(); + } + } + }, + nullptr)); + } + + void StartTests() { + TestConnections(); + TestNotifications(); + } + + void TestNotifications() { + ray::Status status = ray::Status::OK(); + status = server1->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + object_added_handler_1(ObjectID::FromBinary(object_info.object_id)); + NotificationTestCompleteIfSatisfied(); + }); + RAY_CHECK_OK(status); + status = server2->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + object_added_handler_2(ObjectID::FromBinary(object_info.object_id)); + NotificationTestCompleteIfSatisfied(); + }); + RAY_CHECK_OK(status); + + size_t data_size = 1000000; + + // dummy_id is not local. The push function will timeout. 
+ ObjectID dummy_id = ObjectID::FromRandom(); + server1->object_manager_.Push(dummy_id, gcs_client_2->Nodes().GetSelfId()); + + created_object_id1 = ObjectID::FromRandom(); + WriteDataToClient(client1, data_size, created_object_id1); + // Server1 holds Object1 so this Push call will success. + server1->object_manager_.Push(created_object_id1, gcs_client_2->Nodes().GetSelfId()); + + // This timer is used to guarantee that the Push function for dummy_id will timeout. + timer.reset(new boost::asio::deadline_timer(main_service)); + auto period = boost::posix_time::milliseconds(push_timeout_ms + 10); + timer->expires_from_now(period); + created_object_id2 = ObjectID::FromRandom(); + timer->async_wait([this, data_size](const boost::system::error_code &error) { + WriteDataToClient(client2, data_size, created_object_id2); + }); + } + + void NotificationTestCompleteIfSatisfied() { + size_t num_expected_objects1 = 1; + size_t num_expected_objects2 = 2; + if (v1.size() == num_expected_objects1 && v2.size() == num_expected_objects2) { + SubscribeObjectThenWait(); + } + } + + void SubscribeObjectThenWait() { + int data_size = 100; + // Test to ensure Wait works properly during an active subscription to the same + // object. 
+ ObjectID object_1 = WriteDataToClient(client2, data_size); + ObjectID object_2 = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(object_1, gcs_client_1->Nodes().GetSelfId()); + server2->object_manager_.Push(object_2, gcs_client_1->Nodes().GetSelfId()); + + UniqueID sub_id = ray::UniqueID::FromRandom(); + RAY_CHECK_OK(server1->object_manager_.object_directory_->SubscribeObjectLocations( + sub_id, object_1, rpc::Address(), + [this, sub_id, object_1, object_2](const ray::ObjectID &object_id, + const std::unordered_set &clients, + const std::string &spilled_url) { + if (!clients.empty()) { + TestWaitWhileSubscribed(sub_id, object_1, object_2); + } + })); + } + + void TestWaitWhileSubscribed(UniqueID sub_id, ObjectID object_1, ObjectID object_2) { + int required_objects = 1; + int timeout_ms = 1500; + + std::vector object_ids = {object_1, object_2}; + boost::posix_time::ptime start_time = boost::posix_time::second_clock::local_time(); + + UniqueID wait_id = UniqueID::FromRandom(); + + RAY_CHECK_OK(server1->object_manager_.AddWaitRequest( + wait_id, object_ids, std::unordered_map(), timeout_ms, + required_objects, + [this, sub_id, object_1, object_ids, start_time]( + const std::vector &found, + const std::vector &remaining) { + int64_t elapsed = (boost::posix_time::second_clock::local_time() - start_time) + .total_milliseconds(); + RAY_LOG(DEBUG) << "elapsed " << elapsed; + RAY_LOG(DEBUG) << "found " << found.size(); + RAY_LOG(DEBUG) << "remaining " << remaining.size(); + RAY_CHECK(found.size() == 1); + // There's nothing more to test. A check will fail if unexpected behavior is + // triggered. + RAY_CHECK_OK( + server1->object_manager_.object_directory_->UnsubscribeObjectLocations( + sub_id, object_1)); + NextWaitTest(); + })); + + // Skip lookups and rely on Subscribe only to test subscribe interaction. 
+ server1->object_manager_.SubscribeRemainingWaitObjects(wait_id); + } + + void NextWaitTest() { + int data_size = 600; + current_wait_test += 1; + switch (current_wait_test) { + case 0: { + // Ensure timeout_ms = 0 is handled correctly. + // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. + TestWait(data_size, 5, 3, /*timeout_ms=*/0, false, false); + } break; + case 1: { + // Ensure timeout_ms = 1500 is handled correctly. + // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. + TestWait(data_size, 5, 3, wait_timeout_ms, false, false); + } break; + case 2: { + // Generate objects locally to ensure local object code-path works properly. + // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. + TestWait(data_size, 5, 3, wait_timeout_ms, false, /*test_local=*/true); + } break; + case 3: { + // Wait on an object that's never registered with GCS to ensure timeout works + // properly. + TestWait(data_size, /*num_objects=*/5, /*required_objects=*/6, wait_timeout_ms, + /*include_nonexistent=*/true, false); + } break; + case 4: { + // Ensure infinite time code-path works properly. 
+ TestWait(data_size, 5, 5, /*timeout_ms=*/-1, false, false); + } break; + } + } + + void TestWait(int data_size, int num_objects, uint64_t required_objects, int timeout_ms, + bool include_nonexistent, bool test_local) { + std::vector object_ids; + for (int i = -1; ++i < num_objects;) { + ObjectID oid; + if (test_local) { + oid = WriteDataToClient(client1, data_size); + } else { + oid = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(oid, gcs_client_1->Nodes().GetSelfId()); + } + object_ids.push_back(oid); + } + if (include_nonexistent) { + num_objects += 1; + object_ids.push_back(ObjectID::FromRandom()); + } + + boost::posix_time::ptime start_time = boost::posix_time::second_clock::local_time(); + RAY_CHECK_OK(server1->object_manager_.Wait( + object_ids, std::unordered_map(), timeout_ms, + required_objects, + [this, object_ids, num_objects, timeout_ms, required_objects, start_time]( + const std::vector &found, + const std::vector &remaining) { + int64_t elapsed = (boost::posix_time::second_clock::local_time() - start_time) + .total_milliseconds(); + RAY_LOG(DEBUG) << "elapsed " << elapsed; + RAY_LOG(DEBUG) << "found " << found.size(); + RAY_LOG(DEBUG) << "remaining " << remaining.size(); + + // Ensure object order is preserved for all invocations. + size_t j = 0; + size_t k = 0; + for (size_t i = 0; i < object_ids.size(); ++i) { + ObjectID oid = object_ids[i]; + // Make sure the object is in either the found vector or the remaining vector. + if (j < found.size() && found[j] == oid) { + j += 1; + } + if (k < remaining.size() && remaining[k] == oid) { + k += 1; + } + } + if (!found.empty()) { + ASSERT_EQ(j, found.size()); + } + if (!remaining.empty()) { + ASSERT_EQ(k, remaining.size()); + } + + switch (current_wait_test) { + case 0: { + // Ensure timeout_ms = 0 returns expected number of found and remaining + // objects. 
+ ASSERT_TRUE(found.size() <= required_objects); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 1: { + // Ensure lookup succeeds as expected when timeout_ms = 1500. + ASSERT_TRUE(found.size() >= required_objects); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 2: { + // Ensure lookup succeeds as expected when objects are local. + ASSERT_TRUE(found.size() >= required_objects); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 3: { + // Ensure lookup returns after timeout_ms elapses when one object doesn't + // exist. + ASSERT_TRUE(elapsed >= timeout_ms); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 4: { + // Ensure timeout_ms = -1 works properly. + ASSERT_TRUE(static_cast(found.size()) == num_objects); + ASSERT_TRUE(remaining.size() == 0); + TestWaitComplete(); + } break; + } + })); + } + + void TestWaitComplete() { main_service.stop(); } + + void TestConnections() { + RAY_LOG(DEBUG) << "\n" + << "Server node ids:" + << "\n"; + auto data = gcs_client_1->Nodes().Get(node_id_1); + RAY_LOG(DEBUG) << (NodeID::FromBinary(data->node_id()).IsNil()); + RAY_LOG(DEBUG) << "Server 1 NodeID=" << NodeID::FromBinary(data->node_id()); + RAY_LOG(DEBUG) << "Server 1 NodeIp=" << data->node_manager_address(); + RAY_LOG(DEBUG) << "Server 1 NodePort=" << data->node_manager_port(); + ASSERT_EQ(node_id_1, NodeID::FromBinary(data->node_id())); + auto data2 = gcs_client_1->Nodes().Get(node_id_2); + RAY_LOG(DEBUG) << "Server 2 NodeID=" << NodeID::FromBinary(data2->node_id()); + RAY_LOG(DEBUG) << "Server 2 NodeIp=" << data2->node_manager_address(); + RAY_LOG(DEBUG) << "Server 2 NodePort=" << data2->node_manager_port(); + ASSERT_EQ(node_id_2, NodeID::FromBinary(data2->node_id())); + } +}; + +/* TODO(ekl) this seems to be hanging 
occasionally on Linux +TEST_F(TestObjectManager, StartTestObjectManager) { + // TODO: Break this test suite into unit tests. + auto AsyncStartTests = main_service.wrap([this]() { WaitConnections(); }); + AsyncStartTests(); + main_service.run(); +} +*/ + +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ray::TEST_STORE_EXEC_PATH = std::string(argv[1]); + wait_timeout_ms = std::stoi(std::string(argv[2])); + ray::TEST_GCS_SERVER_EXEC_PATH = std::string(argv[3]); + return RUN_ALL_TESTS(); +} diff --git a/src/ray/object_manager/test/pull_manager_test.cc b/src/ray/object_manager/test/pull_manager_test.cc index 345cc6ceadfe..9230c87e9db9 100644 --- a/src/ray/object_manager/test/pull_manager_test.cc +++ b/src/ray/object_manager/test/pull_manager_test.cc @@ -10,14 +10,13 @@ namespace ray { using ::testing::ElementsAre; -class PullManagerTestWithCapacity { +class PullManagerTest : public ::testing::Test { public: - PullManagerTestWithCapacity(size_t num_available_bytes) + PullManagerTest() : self_node_id_(NodeID::FromRandom()), object_is_local_(false), num_send_pull_request_calls_(0), num_restore_spilled_object_calls_(0), - num_object_store_full_calls_(0), fake_time_(0), pull_manager_(self_node_id_, [this](const ObjectID &object_id) { return object_is_local_; }, @@ -29,51 +28,17 @@ class PullManagerTestWithCapacity { num_restore_spilled_object_calls_++; restore_object_callback_ = callback; }, - [this]() { return fake_time_; }, 10000, num_available_bytes, - [this]() { num_object_store_full_calls_++; }) {} - - void AssertNoLeaks() { - ASSERT_TRUE(pull_manager_.pull_request_bundles_.empty()); - ASSERT_TRUE(pull_manager_.object_pull_requests_.empty()); - ASSERT_TRUE(pull_manager_.active_object_pull_requests_.empty()); - // Most tests should not throw OOM. 
- ASSERT_EQ(num_object_store_full_calls_, 0); - } + [this]() { return fake_time_; }, 10000) {} NodeID self_node_id_; bool object_is_local_; int num_send_pull_request_calls_; int num_restore_spilled_object_calls_; - int num_object_store_full_calls_; std::function restore_object_callback_; double fake_time_; PullManager pull_manager_; }; -class PullManagerTest : public PullManagerTestWithCapacity, public ::testing::Test { - public: - PullManagerTest() : PullManagerTestWithCapacity(1) {} - - void AssertNumActiveRequestsEquals(size_t num_requests) { - ASSERT_EQ(pull_manager_.object_pull_requests_.size(), num_requests); - ASSERT_EQ(pull_manager_.active_object_pull_requests_.size(), num_requests); - } -}; - -class PullManagerWithAdmissionControlTest : public PullManagerTestWithCapacity, - public ::testing::Test { - public: - PullManagerWithAdmissionControlTest() : PullManagerTestWithCapacity(10) {} - - void AssertNumActiveRequestsEquals(size_t num_requests) { - ASSERT_EQ(pull_manager_.active_object_pull_requests_.size(), num_requests); - } - - bool IsUnderCapacity(size_t num_bytes_requested) { - return num_bytes_requested <= pull_manager_.num_bytes_available_; - } -}; - std::vector CreateObjectRefs(int num_objs) { std::vector refs; for (int i = 0; i < num_objs; i++) { @@ -88,14 +53,14 @@ std::vector CreateObjectRefs(int num_objs) { TEST_F(PullManagerTest, TestStaleSubscription) { auto refs = CreateObjectRefs(1); auto oid = ObjectRefsToIds(refs)[0]; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(oid, client_ids, "", 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(oid, client_ids, ""); // There are no client ids to pull from. 
ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -106,30 +71,29 @@ TEST_F(PullManagerTest, TestStaleSubscription) { ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(oid, client_ids, "", 0); + pull_manager_.OnLocationChange(oid, client_ids, ""); // Now we're getting a notification about an object that was already cancelled. ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestRestoreSpilledObject) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -137,7 +101,7 @@ TEST_F(PullManagerTest, TestRestoreSpilledObject) { client_ids.insert(NodeID::FromRandom()); fake_time_ += 10.; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // The behavior is supposed to be to always restore the spilled object if possible (even // if it exists elsewhere in the cluster). @@ -147,27 +111,26 @@ TEST_F(PullManagerTest, TestRestoreSpilledObject) { // Don't restore an object if it's local. 
object_is_local_ = true; num_restore_spilled_object_calls_ = 0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); ASSERT_EQ(num_restore_spilled_object_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestRestoreObjectFailed) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; - auto req_id = pull_manager_.Pull(refs, &objects_to_locate); + pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -180,14 +143,14 @@ TEST_F(PullManagerTest, TestRestoreObjectFailed) { ASSERT_EQ(num_restore_spilled_object_calls_, 1); client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // We always assume the restore succeeded so there's only 1 restore call still. 
ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 1); fake_time_ += 10.0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 2); @@ -198,32 +161,29 @@ TEST_F(PullManagerTest, TestRestoreObjectFailed) { ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // Now that we've successfully sent a pull request, we need to wait for the retry period // before sending another one. ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - - pull_manager_.CancelPull(req_id); - AssertNoLeaks(); } TEST_F(PullManagerTest, TestManyUpdates) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (int i = 0; i < 100; i++) { - pull_manager_.OnLocationChange(obj1, client_ids, "", 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, ""); } // Since no time has passed, only send a single pull request. 
@@ -232,26 +192,25 @@ TEST_F(PullManagerTest, TestManyUpdates) { auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestRetryTimer) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); // We need to call OnLocationChange at least once, to population the list of nodes with // the object. - pull_manager_.OnLocationChange(obj1, client_ids, "", 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, ""); ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -261,7 +220,7 @@ TEST_F(PullManagerTest, TestRetryTimer) { // Location changes can trigger reset timer. for (; fake_time_ <= 120 * 10; fake_time_ += 1.) 
{ - pull_manager_.OnLocationChange(obj1, client_ids, "", 0); + pull_manager_.OnLocationChange(obj1, client_ids, ""); } // We should make a pull request every tick (even if it's a duplicate to a node we're @@ -279,59 +238,55 @@ TEST_F(PullManagerTest, TestRetryTimer) { auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestBasic) { auto refs = CreateObjectRefs(3); auto oids = ObjectRefsToIds(refs); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); + ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); + ASSERT_EQ(num_send_pull_request_calls_, i + 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); } - ASSERT_EQ(num_send_pull_request_calls_, oids.size()); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(oids.size()); // Don't pull an object if it's local. object_is_local_ = true; num_send_pull_request_calls_ = 0; - fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); } ASSERT_EQ(num_send_pull_request_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, oids); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); // Don't pull a remote object if we've canceled. 
object_is_local_ = false; num_send_pull_request_calls_ = 0; - fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); } ASSERT_EQ(num_send_pull_request_calls_, 0); - - AssertNoLeaks(); } TEST_F(PullManagerTest, TestDeduplicateBundles) { auto refs = CreateObjectRefs(3); auto oids = ObjectRefsToIds(refs); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id1 = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); + ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); objects_to_locate.clear(); auto req_id2 = pull_manager_.Pull(refs, &objects_to_locate); @@ -340,21 +295,20 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); + ASSERT_EQ(num_send_pull_request_calls_, i + 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); } - ASSERT_EQ(num_send_pull_request_calls_, oids.size()); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(oids.size()); // Cancel one request. auto objects_to_cancel = pull_manager_.CancelPull(req_id1); ASSERT_TRUE(objects_to_cancel.empty()); // Objects should still be pulled because the other request is still open. 
- AssertNumActiveRequestsEquals(oids.size()); + ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); fake_time_ += 10; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); ASSERT_EQ(num_send_pull_request_calls_, i + 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); } @@ -362,191 +316,15 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { // Cancel the other request. objects_to_cancel = pull_manager_.CancelPull(req_id2); ASSERT_EQ(objects_to_cancel, oids); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); // Don't pull a remote object if we've canceled. object_is_local_ = false; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); } ASSERT_EQ(num_send_pull_request_calls_, 0); - - AssertNoLeaks(); -} - -TEST_F(PullManagerWithAdmissionControlTest, TestBasic) { - /// Test admission control for a single pull bundle request. We should - /// activate the request when we are under the reported capacity and - /// deactivate it when we are over. 
- auto refs = CreateObjectRefs(3); - auto oids = ObjectRefsToIds(refs); - size_t object_size = 2; - AssertNumActiveRequestsEquals(0); - std::vector objects_to_locate; - auto req_id = pull_manager_.Pull(refs, &objects_to_locate); - ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); - - std::unordered_set client_ids; - client_ids.insert(NodeID::FromRandom()); - for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); - } - ASSERT_EQ(num_send_pull_request_calls_, oids.size()); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(oids.size()); - ASSERT_TRUE(IsUnderCapacity(oids.size() * object_size)); - - // Reduce the available memory. - ASSERT_EQ(num_object_store_full_calls_, 0); - pull_manager_.UpdatePullsBasedOnAvailableMemory(oids.size() * object_size - 1); - AssertNumActiveRequestsEquals(0); - ASSERT_EQ(num_object_store_full_calls_, 1); - // No new pull requests after the next tick. - fake_time_ += 10; - auto prev_pull_requests = num_send_pull_request_calls_; - for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); - ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - } - - // Increase the available memory again. - pull_manager_.UpdatePullsBasedOnAvailableMemory(oids.size() * object_size); - AssertNumActiveRequestsEquals(oids.size()); - ASSERT_TRUE(IsUnderCapacity(oids.size() * object_size)); - ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests + oids.size()); - - // OOM was not triggered a second time. - ASSERT_EQ(num_object_store_full_calls_, 1); - num_object_store_full_calls_ = 0; - - pull_manager_.CancelPull(req_id); - AssertNoLeaks(); -} - -TEST_F(PullManagerWithAdmissionControlTest, TestQueue) { - /// Test admission control for a queue of pull bundle requests. 
We should - /// activate as many requests as we can, subject to the reported capacity. - int object_size = 2; - int num_oids_per_request = 2; - int num_requests = 3; - - std::vector> bundles; - std::vector req_ids; - for (int i = 0; i < num_requests; i++) { - auto refs = CreateObjectRefs(num_oids_per_request); - auto oids = ObjectRefsToIds(refs); - std::vector objects_to_locate; - auto req_id = pull_manager_.Pull(refs, &objects_to_locate); - ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); - - bundles.push_back(oids); - req_ids.push_back(req_id); - } - - std::unordered_set client_ids; - client_ids.insert(NodeID::FromRandom()); - for (auto &oids : bundles) { - for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); - } - } - - for (int capacity = 0; capacity < 20; capacity++) { - int num_requests_expected = - std::min(num_requests, capacity / (object_size * num_oids_per_request)); - pull_manager_.UpdatePullsBasedOnAvailableMemory(capacity); - - AssertNumActiveRequestsEquals(num_requests_expected * num_oids_per_request); - // The total requests that are active is under the specified capacity. - ASSERT_TRUE( - IsUnderCapacity(num_requests_expected * num_oids_per_request * object_size)); - // This is the maximum number of requests that can be served at once that - // is under the capacity. - if (num_requests_expected < num_requests) { - ASSERT_FALSE(IsUnderCapacity((num_requests_expected + 1) * num_oids_per_request * - object_size)); - } - // Check that OOM was triggered. - if (num_requests_expected == 0) { - ASSERT_EQ(num_object_store_full_calls_, 1); - } else { - ASSERT_EQ(num_object_store_full_calls_, 0); - } - num_object_store_full_calls_ = 0; - } - - for (auto req_id : req_ids) { - pull_manager_.CancelPull(req_id); - } - AssertNoLeaks(); -} - -TEST_F(PullManagerWithAdmissionControlTest, TestCancel) { - /// Test admission control while requests are cancelled out-of-order. 
When an - /// active request is cancelled, we should activate another request in the - /// queue, if there is one that satisfies the reported capacity. - auto test_cancel = [&](std::vector object_sizes, int capacity, size_t cancel_idx, - int num_active_requests_expected_before, - int num_active_requests_expected_after) { - pull_manager_.UpdatePullsBasedOnAvailableMemory(capacity); - auto refs = CreateObjectRefs(object_sizes.size()); - auto oids = ObjectRefsToIds(refs); - std::vector req_ids; - for (auto &ref : refs) { - std::vector objects_to_locate; - auto req_id = pull_manager_.Pull({ref}, &objects_to_locate); - req_ids.push_back(req_id); - } - for (size_t i = 0; i < object_sizes.size(); i++) { - pull_manager_.OnLocationChange(oids[i], {}, "", object_sizes[i]); - } - AssertNumActiveRequestsEquals(num_active_requests_expected_before); - pull_manager_.CancelPull(req_ids[cancel_idx]); - AssertNumActiveRequestsEquals(num_active_requests_expected_after); - - // Request is really canceled. - pull_manager_.OnLocationChange(oids[cancel_idx], {NodeID::FromRandom()}, "", - object_sizes[cancel_idx]); - ASSERT_EQ(num_send_pull_request_calls_, 0); - - // The expected number of requests at the head of the queue are pulled. - int num_active = 0; - for (size_t i = 0; i < refs.size() && num_active < num_active_requests_expected_after; - i++) { - pull_manager_.OnLocationChange(oids[i], {NodeID::FromRandom()}, "", - object_sizes[i]); - if (i != cancel_idx) { - num_active++; - } - } - ASSERT_EQ(num_send_pull_request_calls_, num_active_requests_expected_after); - - // Reset state. - for (size_t i = 0; i < req_ids.size(); i++) { - if (i != cancel_idx) { - pull_manager_.CancelPull(req_ids[i]); - } - } - num_send_pull_request_calls_ = 0; - }; - - // The next request in the queue is infeasible. If it is canceled, the - // request after that is activated. - test_cancel({1, 1, 2, 1}, 3, 2, 2, 3); - - // If an activated request is canceled, the next request is activated. 
- test_cancel({1, 1, 2, 1}, 3, 0, 2, 2); - test_cancel({1, 1, 2, 1}, 3, 1, 2, 2); - - // Cancellation of requests at the end of the queue has no effect. - test_cancel({1, 1, 2, 1, 1}, 3, 3, 2, 2); - - // As many new requests as possible are activated when one is canceled. - test_cancel({1, 2, 1, 1, 1}, 3, 1, 2, 3); - - AssertNoLeaks(); } } // namespace ray diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 43a3a667407b..799530d274e9 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -186,7 +186,6 @@ message GetObjectLocationsOwnerRequest { message GetObjectLocationsOwnerReply { repeated bytes node_ids = 1; - uint64 object_size = 2; } message KillActorRequest { diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index a332a908159e..d0793c35ca13 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -413,8 +413,6 @@ message ObjectLocationInfo { // For objects that have been spilled to external storage, the URL from which // they can be retrieved. string spilled_url = 3; - // The size of the object in bytes. - uint64 size = 4; } // A notification message about one object's locations being changed. @@ -425,8 +423,6 @@ message ObjectLocationChange { // The object has been spilled to this URL. This should be set xor the above // fields are set. string spilled_url = 3; - // The size of the object in bytes. - uint64 size = 4; } // A notification message about one node's resources being changed. diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index eda00b806b26..35c86b3bedbe 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -272,8 +272,6 @@ message AddObjectLocationRequest { // The spilled URL that will be added to GCS Service. Either this or the node // ID should be set. string spilled_url = 3; - // The size of the object in bytes. 
- uint64 size = 4; } message AddObjectLocationReply { diff --git a/src/ray/raylet/reconstruction_policy.cc b/src/ray/raylet/reconstruction_policy.cc index f4fd3d025fda..59d4789f08c5 100644 --- a/src/ray/raylet/reconstruction_policy.cc +++ b/src/ray/raylet/reconstruction_policy.cc @@ -179,7 +179,7 @@ void ReconstructionPolicy::HandleTaskLeaseExpired(const TaskID &task_id) { created_object_id, it->second.owner_addresses[created_object_id], [this, task_id, reconstruction_attempt]( const ray::ObjectID &object_id, const std::unordered_set &nodes, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url) { if (nodes.empty() && spilled_url.empty()) { // The required object no longer exists on any live nodes. Attempt // reconstruction. diff --git a/src/ray/raylet/reconstruction_policy_test.cc b/src/ray/raylet/reconstruction_policy_test.cc index 8b5fd9d0e75c..199e4d51ee2d 100644 --- a/src/ray/raylet/reconstruction_policy_test.cc +++ b/src/ray/raylet/reconstruction_policy_test.cc @@ -58,9 +58,9 @@ class MockObjectDirectory : public ObjectDirectoryInterface { const ObjectID object_id = callback.first; auto it = locations_.find(object_id); if (it == locations_.end()) { - callback.second(object_id, std::unordered_set(), "", 0); + callback.second(object_id, std::unordered_set(), ""); } else { - callback.second(object_id, it->second, "", 0); + callback.second(object_id, it->second, ""); } } callbacks_.clear(); diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index bbae5bb144b0..616e7348283b 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -185,9 +185,8 @@ class MockObjectInfoAccessor : public gcs::ObjectInfoAccessor { MOCK_METHOD1(AsyncGetAll, Status(const gcs::MultiItemCallback &callback)); - MOCK_METHOD4(AsyncAddLocation, - Status(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const 
gcs::StatusCallback &callback)); + MOCK_METHOD3(AsyncAddLocation, Status(const ObjectID &object_id, const NodeID &node_id, + const gcs::StatusCallback &callback)); Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, const gcs::StatusCallback &callback) { diff --git a/src/ray/test/run_object_manager_tests.sh b/src/ray/test/run_object_manager_tests.sh new file mode 100755 index 000000000000..ebb5eba223aa --- /dev/null +++ b/src/ray/test/run_object_manager_tests.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# This needs to be run in the root directory. + +# Cause the script to exit if a single command fails. +set -e +set -x + +bazel build "//:object_manager_stress_test" "//:object_manager_test" "//:plasma_store_server" + +# Get the directory in which this script is executing. +SCRIPT_DIR="$(dirname "$0")" +RAY_ROOT="$SCRIPT_DIR/../../.." +# Makes $RAY_ROOT an absolute path. +RAY_ROOT="$(cd "$RAY_ROOT" && pwd)" +if [ -z "$RAY_ROOT" ] ; then + exit 1 +fi +# Ensure we're in the right directory. +if [ ! -d "$RAY_ROOT/python" ]; then + echo "Unable to find root Ray directory. Has this script moved?" + exit 1 +fi + +REDIS_MODULE="./bazel-bin/libray_redis_module.so" +LOAD_MODULE_ARGS=(--loadmodule "${REDIS_MODULE}") +STORE_EXEC="./bazel-bin/plasma_store_server" +GCS_SERVER_EXEC="./bazel-bin/gcs_server" + +# Allow cleanup commands to fail. +bazel run //:redis-cli -- -p 6379 shutdown || true +bazel run //:redis-cli -- -p 6380 shutdown || true +sleep 1s +bazel run //:redis-server -- --loglevel warning "${LOAD_MODULE_ARGS[@]}" --port 6379 & +bazel run //:redis-server -- --loglevel warning "${LOAD_MODULE_ARGS[@]}" --port 6380 & +sleep 1s +# Run tests. +./bazel-bin/object_manager_stress_test $STORE_EXEC $GCS_SERVER_EXEC +sleep 1s +# Use timeout=1000ms for the Wait tests. 
+./bazel-bin/object_manager_test $STORE_EXEC 1000 $GCS_SERVER_EXEC +bazel run //:redis-cli -- -p 6379 shutdown +bazel run //:redis-cli -- -p 6380 shutdown From 3fb2ba33c1bbd65cf05a891cc05797569df24733 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 235/244] Revert "add 3.8 (#13608)" This reverts commit 721ae664940951b57b59195a9d54ee21fecbb69e. --- .../linux-py3.8-requirements_tune.txt | 864 ------------------ 1 file changed, 864 deletions(-) delete mode 100644 python/requirements/linux-py3.8-requirements_tune.txt diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt deleted file mode 100644 index 36dbb1dce9ad..000000000000 --- a/python/requirements/linux-py3.8-requirements_tune.txt +++ /dev/null @@ -1,864 +0,0 @@ -# -# This file is autogenerated by pip-compile -# To update, run: -# -# pip-compile requirements_tune.in -# ---find-links https://download.pytorch.org/whl/torch_stable.html - -absl-py==0.11.0 - # via tensorboard -alembic==1.4.1 - # via - # mlflow - # optuna -argon2-cffi==20.1.0 - # via notebook -async-generator==1.10 - # via nbclient -atari-py==0.2.6 - # via - # -c ../requirements.txt - # gym -attrs==20.3.0 - # via - # cmd2 - # jsonschema - # pytest -autocfg==0.0.6 - # via gluoncv -autogluon.core==0.0.16b20210121 - # via gluoncv -autograd==1.3 - # via autogluon.core -ax-platform==0.1.19 ; python_version >= "3.7" - # via -r requirements_tune.in -azure-core==1.10.0 - # via azure-storage-blob -azure-storage-blob==12.7.1 - # via mlflow -backcall==0.2.0 - # via ipython -bayesian-optimization==1.2.0 - # via - # -r requirements_tune.in - # nevergrad -bcrypt==3.2.0 - # via paramiko -bleach==3.2.2 - # via nbconvert -bokeh==2.2.3 - # via dask -boto3==1.16.57 - # via - # -c ../requirements.txt - # autogluon.core - # smart-open -botocore==1.19.57 - # via - # boto3 - # s3transfer -botorch==0.3.3 - # via 
ax-platform -cachetools==4.2.0 - # via google-auth -certifi==2020.12.5 - # via - # kubernetes - # msrest - # requests - # sentry-sdk -cffi==1.14.4 - # via - # argon2-cffi - # bcrypt - # cryptography - # pynacl -chardet==4.0.0 - # via requests -click==7.1.2 - # via - # -c ../requirements.txt - # databricks-cli - # distributed - # flask - # mlflow - # sacremoses - # wandb -cliff==3.6.0 - # via optuna -cloudpickle==1.6.0 - # via - # dask - # distributed - # gym - # hyperopt - # mlflow - # tensorflow-probability -cma==3.0.3 - # via nevergrad -cmaes==0.7.0 - # via optuna -cmd2==1.4.0 - # via cliff -colorama==0.4.4 - # via - # -c ../requirements.txt - # cmd2 -colorlog==4.7.2 - # via optuna -configparser==5.0.1 - # via wandb -configspace==0.4.10 - # via - # -r requirements_tune.in - # autogluon.core - # hpbandster -cryptography==3.3.1 - # via - # azure-storage-blob - # paramiko -cycler==0.10.0 - # via matplotlib -cython==0.29.0 - # via - # -c ../requirements.txt - # autogluon.core - # configspace -dask[complete]==2021.1.0 - # via - # -c ../requirements.txt - # autogluon.core - # distributed -databricks-cli==0.14.1 - # via mlflow -dataclasses==0.6 - # via torch -decorator==4.4.2 - # via - # ipython - # networkx - # paramz - # tensorflow-probability -decord==0.4.2 - # via gluoncv -defusedxml==0.6.0 - # via nbconvert -dill==0.3.3 - # via autogluon.core -distributed==2021.1.0 - # via - # autogluon.core - # dask -dm-tree==0.1.5 - # via - # -c ../requirements.txt - # tensorflow-probability -docker-pycreds==0.4.0 - # via wandb -docker==4.4.1 - # via mlflow -dragonfly-opt==0.1.6 - # via -r requirements_tune.in -entrypoints==0.3 - # via - # mlflow - # nbconvert -filelock==3.0.12 - # via - # -c ../requirements.txt - # transformers -flask==1.1.2 - # via - # -c ../requirements.txt - # mlflow - # prometheus-flask-exporter -fsspec==0.8.5 - # via - # dask - # pytorch-lightning -future==0.18.2 - # via - # autograd - # dragonfly-opt - # hyperopt - # pyglet - # pytorch-lightning - # torch 
-gast==0.4.0 - # via tensorflow-probability -gitdb==4.0.5 - # via gitpython -gitpython==3.1.12 - # via - # mlflow - # wandb -gluoncv==0.9.1 - # via -r requirements_tune.in -google-auth-oauthlib==0.4.2 - # via tensorboard -google-auth==1.24.0 - # via - # google-auth-oauthlib - # kubernetes - # tensorboard -gpy==1.9.9 - # via -r requirements_tune.in -gpytorch==1.3.1 - # via botorch -graphviz==0.8.4 - # via - # autogluon.core - # mxnet -grpcio==1.35.0 - # via - # -c ../requirements.txt - # tensorboard -gunicorn==20.0.4 - # via mlflow -gym[atari]==0.18.0 - # via - # -c ../requirements.txt - # -r requirements_tune.in -h5py==3.1.0 - # via - # -r requirements_tune.in - # keras -heapdict==1.0.1 - # via zict -hpbandster==0.7.4 - # via -r requirements_tune.in -hyperopt==0.2.5 - # via -r requirements_tune.in -idna==2.10 - # via requests -ipykernel==5.4.3 - # via - # ipywidgets - # jupyter - # jupyter-console - # notebook - # qtconsole -ipython-genutils==0.2.0 - # via - # nbformat - # notebook - # qtconsole - # traitlets -ipython==7.19.0 - # via - # ipykernel - # ipywidgets - # jupyter-console -ipywidgets==7.6.3 - # via jupyter -isodate==0.6.0 - # via msrest -itsdangerous==1.1.0 - # via flask -jedi==0.18.0 - # via ipython -jinja2==2.11.2 - # via - # ax-platform - # bokeh - # flask - # nbconvert - # notebook -jmespath==0.10.0 - # via - # boto3 - # botocore -joblib==1.0.0 - # via - # optuna - # sacremoses - # scikit-learn - # scikit-optimize -jsonschema==3.2.0 - # via - # -c ../requirements.txt - # nbformat -jupyter-client==6.1.11 - # via - # ipykernel - # jupyter-console - # nbclient - # notebook - # qtconsole -jupyter-console==6.2.0 - # via jupyter -jupyter-core==4.7.0 - # via - # jupyter-client - # nbconvert - # nbformat - # notebook - # qtconsole -jupyter==1.0.0 - # via -r requirements_tune.in -jupyterlab-pygments==0.1.2 - # via nbconvert -jupyterlab-widgets==1.0.0 - # via ipywidgets -keras==2.4.3 - # via -r requirements_tune.in -kiwisolver==1.3.1 - # via matplotlib 
-kubernetes==12.0.1 - # via - # -c ../requirements.txt - # -r requirements_tune.in -lightgbm==3.1.1 - # via -r requirements_tune.in -locket==0.2.1 - # via partd -mako==1.1.4 - # via alembic -markdown==3.3.3 - # via tensorboard -markupsafe==1.1.1 - # via - # jinja2 - # mako -matplotlib==3.3.3 - # via - # -r requirements_tune.in - # autogluon.core - # gluoncv - # zoopt -mistune==0.8.4 - # via nbconvert -mlflow==1.13.1 - # via -r requirements_tune.in -more-itertools==8.6.0 - # via pytest -msgpack==1.0.2 - # via - # -c ../requirements.txt - # distributed -msrest==0.6.19 - # via azure-storage-blob -mxnet==1.7.0.post1 - # via -r requirements_tune.in -nbclient==0.5.1 - # via nbconvert -nbconvert==6.0.7 - # via - # jupyter - # notebook -nbformat==5.1.2 - # via - # ipywidgets - # nbclient - # nbconvert - # notebook -nest-asyncio==1.4.3 - # via nbclient -netifaces==0.10.9 - # via hpbandster -networkx==2.5 - # via - # -c ../requirements.txt - # hyperopt -nevergrad==0.4.2.post5 - # via -r requirements_tune.in -notebook==6.2.0 - # via - # jupyter - # widgetsnbextension -numpy==1.19.5 - # via - # -c ../requirements.txt - # atari-py - # autogluon.core - # autograd - # bayesian-optimization - # bokeh - # cma - # cmaes - # configspace - # dask - # decord - # dragonfly-opt - # gluoncv - # gpy - # gym - # h5py - # hpbandster - # hyperopt - # keras - # lightgbm - # matplotlib - # mlflow - # mxnet - # nevergrad - # opencv-python - # optuna - # pandas - # paramz - # patsy - # pytorch-lightning - # scikit-learn - # scikit-optimize - # scipy - # statsmodels - # tensorboard - # tensorboardx - # tensorflow-probability - # torch - # torchvision - # transformers - # xgboost - # zoopt -oauthlib==3.1.0 - # via requests-oauthlib -opencv-python==4.5.1.48 - # via - # gluoncv - # gym -optuna==2.3.0 - # via -r requirements_tune.in -packaging==20.8 - # via - # bleach - # bokeh - # optuna - # pytest - # transformers -pandas==1.0.5 - # via - # -c ../requirements.txt - # autogluon.core - # ax-platform - 
# dask - # gluoncv - # mlflow - # statsmodels -pandocfilters==1.4.3 - # via nbconvert -paramiko==2.7.2 - # via autogluon.core -paramz==0.9.5 - # via gpy -parso==0.8.1 - # via jedi -partd==1.1.0 - # via dask -patsy==0.5.1 - # via statsmodels -pbr==5.5.1 - # via - # cliff - # stevedore -pexpect==4.8.0 - # via - # -c ../requirements.txt - # ipython -pickleshare==0.7.5 - # via ipython -pillow==7.2.0 ; platform_system != "Windows" - # via - # -c ../requirements.txt - # bokeh - # gluoncv - # gym - # matplotlib - # torchvision -plotly==4.14.3 - # via ax-platform -pluggy==0.13.1 - # via pytest -portalocker==2.0.0 - # via gluoncv -prettytable==0.7.2 - # via cliff -prometheus-client==0.9.0 - # via - # -c ../requirements.txt - # notebook - # prometheus-flask-exporter -prometheus-flask-exporter==0.18.1 - # via mlflow -promise==2.3 - # via wandb -prompt-toolkit==3.0.11 - # via - # ipython - # jupyter-console -protobuf==3.14.0 - # via - # -c ../requirements.txt - # mlflow - # tensorboard - # tensorboardx - # wandb -psutil==5.8.0 - # via - # distributed - # wandb -ptyprocess==0.7.0 - # via - # pexpect - # terminado -py==1.10.0 - # via pytest -pyaml==20.4.0 - # via scikit-optimize -pyasn1-modules==0.2.8 - # via google-auth -pyasn1==0.4.8 - # via - # pyasn1-modules - # rsa -pycparser==2.20 - # via cffi -pyglet==1.5.0 - # via gym -pygments==2.7.4 - # via - # -c ../requirements.txt - # ipython - # jupyter-console - # jupyterlab-pygments - # nbconvert - # qtconsole -pynacl==1.4.0 - # via paramiko -pyparsing==2.4.7 - # via - # cliff - # configspace - # matplotlib - # packaging -pyperclip==1.8.1 - # via cmd2 -pyro4==4.80 - # via hpbandster -pyrsistent==0.17.3 - # via jsonschema -pytest-remotedata==0.3.2 - # via -r requirements_tune.in -pytest==5.4.3 - # via - # -c ../requirements.txt - # autogluon.core - # pytest-remotedata -python-dateutil==2.8.1 - # via - # alembic - # bokeh - # botocore - # jupyter-client - # kubernetes - # matplotlib - # mlflow - # pandas - # wandb 
-python-editor==1.0.4 - # via alembic -pytorch-lightning-bolts==0.2.5 - # via -r requirements_tune.in -pytorch-lightning==1.0.3 - # via - # -r requirements_tune.in - # pytorch-lightning-bolts -pytz==2020.5 - # via pandas -pyyaml==5.4.1 - # via - # -c ../requirements.txt - # autocfg - # bokeh - # cliff - # dask - # distributed - # gluoncv - # keras - # kubernetes - # mlflow - # pyaml - # pytorch-lightning - # wandb - # yacs -pyzmq==21.0.1 - # via - # jupyter-client - # notebook - # qtconsole -qtconsole==5.0.1 - # via jupyter -qtpy==1.9.0 - # via qtconsole -querystring-parser==1.2.4 - # via mlflow -regex==2020.11.13 - # via - # sacremoses - # transformers -requests-oauthlib==1.3.0 - # via - # google-auth-oauthlib - # kubernetes - # msrest -requests==2.25.1 - # via - # -c ../requirements.txt - # autogluon.core - # azure-core - # databricks-cli - # docker - # gluoncv - # kubernetes - # mlflow - # msrest - # mxnet - # requests-oauthlib - # sigopt - # tensorboard - # transformers - # wandb -retrying==1.3.3 - # via plotly -rsa==4.7 - # via google-auth -s3transfer==0.3.4 - # via boto3 -sacremoses==0.0.43 - # via transformers -scikit-learn==0.22.2 - # via - # -c ../requirements.txt - # -r requirements_tune.in - # autogluon.core - # ax-platform - # bayesian-optimization - # gpytorch - # lightgbm - # scikit-optimize -scikit-optimize==0.8.1 - # via - # -r requirements_tune.in - # autogluon.core -scipy==1.4.1 - # via - # -c ../requirements.txt - # autogluon.core - # ax-platform - # bayesian-optimization - # botorch - # dragonfly-opt - # gluoncv - # gpy - # gpytorch - # gym - # hpbandster - # hyperopt - # keras - # lightgbm - # optuna - # paramz - # scikit-learn - # scikit-optimize - # statsmodels - # xgboost -send2trash==1.5.0 - # via notebook -sentencepiece==0.1.95 - # via transformers -sentry-sdk==0.19.5 - # via wandb -serpent==1.30.2 - # via - # hpbandster - # pyro4 -shortuuid==1.0.1 - # via wandb -sigopt==5.7.0 - # via -r requirements_tune.in -six==1.15.0 - # via - # 
absl-py - # argon2-cffi - # atari-py - # azure-core - # bcrypt - # bleach - # cryptography - # cycler - # databricks-cli - # dm-tree - # docker - # docker-pycreds - # dragonfly-opt - # google-auth - # gpy - # grpcio - # hyperopt - # isodate - # jsonschema - # kubernetes - # mlflow - # paramz - # patsy - # plotly - # promise - # protobuf - # pynacl - # pytest-remotedata - # python-dateutil - # querystring-parser - # retrying - # sacremoses - # tensorboard - # tensorboardx - # tensorflow-probability - # wandb - # websocket-client -smart_open==4.0.1 - # via - # -c ../requirements.txt - # -r requirements_tune.in -smmap==3.0.4 - # via gitdb -sortedcontainers==2.3.0 - # via distributed -sqlalchemy==1.3.22 - # via - # alembic - # mlflow - # optuna -sqlparse==0.4.1 - # via mlflow -statsmodels==0.12.1 - # via hpbandster -stevedore==3.3.0 - # via cliff -subprocess32==3.5.4 - # via wandb -tabulate==0.8.7 - # via - # -c ../requirements.txt - # databricks-cli -tblib==1.7.0 - # via distributed -tensorboard-plugin-wit==1.8.0 - # via tensorboard -tensorboard==2.4.1 - # via pytorch-lightning -tensorboardx==2.1 - # via - # -c ../requirements.txt - # gluoncv -tensorflow-probability==0.11.1 - # via -r requirements_tune.in -terminado==0.9.2 - # via notebook -testpath==0.4.4 - # via nbconvert -timm==0.3.2 - # via -r requirements_tune.in -tokenizers==0.8.1.rc2 - # via transformers -toolz==0.11.1 - # via - # dask - # distributed - # partd -torch==1.7.0+cpu ; sys_platform != "darwin" - # via - # -r requirements_tune.in - # botorch - # gpytorch - # pytorch-lightning - # pytorch-lightning-bolts - # timm - # torchvision -torchvision==0.8.1+cpu ; sys_platform != "darwin" - # via - # -r requirements_tune.in - # timm -tornado==6.1 - # via - # autogluon.core - # bokeh - # distributed - # ipykernel - # jupyter-client - # notebook - # terminado -tqdm==4.56.0 - # via - # autogluon.core - # gluoncv - # hyperopt - # optuna - # pytorch-lightning - # sacremoses - # transformers -traitlets==5.0.5 - # via 
- # ipykernel - # ipython - # ipywidgets - # jupyter-client - # jupyter-core - # nbclient - # nbconvert - # nbformat - # notebook - # qtconsole -transformers==3.1 - # via -r requirements_tune.in -typeguard==2.10.0 - # via ax-platform -typing-extensions==3.7.4.3 - # via - # bokeh - # nevergrad - # torch -typing==3.7.4.3 - # via configspace -urllib3==1.26.2 - # via - # botocore - # kubernetes - # requests - # sentry-sdk -wandb==0.10.12 - # via -r requirements_tune.in -watchdog==1.0.2 - # via wandb -wcwidth==0.2.5 - # via - # cmd2 - # prompt-toolkit - # pytest -webencodings==0.5.1 - # via bleach -websocket-client==0.57.0 - # via - # docker - # kubernetes -werkzeug==1.0.1 - # via - # -c ../requirements.txt - # flask - # tensorboard -wheel==0.36.2 - # via - # lightgbm - # tensorboard -widgetsnbextension==3.5.1 - # via ipywidgets -xgboost==1.3.0.post0 - # via -r requirements_tune.in -yacs==0.1.8 - # via gluoncv -zict==2.0.0 - # via distributed -zoopt==0.4.1 - # via -r requirements_tune.in - -# The following packages are considered to be unsafe in a requirements file: -# setuptools From bd3ad3b5088c28b72c14b200fef7b1722fab4a45 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 236/244] Revert "Revert "Inline small objects in GetObjectStatus response. (#13309)" (#13615)" This reverts commit 292fd948dd078b2e1afc959a908814321d9c2b1b. 
--- python/ray/_raylet.pyx | 7 +-- python/ray/includes/libcoreworker.pxd | 3 +- python/ray/tests/test_advanced.py | 37 ++++++++++++++ src/ray/core_worker/core_worker.cc | 48 +++++++++++++----- src/ray/core_worker/core_worker.h | 4 +- src/ray/core_worker/future_resolver.cc | 69 +++++++++++++++++--------- src/ray/core_worker/future_resolver.h | 1 + src/ray/protobuf/core_worker.proto | 12 +++++ 8 files changed, 140 insertions(+), 41 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 8ba80852fb40..4b5f9deeef1a 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -898,16 +898,17 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) - def object_exists(self, ObjectRef object_ref): + def object_exists(self, ObjectRef object_ref, memory_store_only=False): cdef: c_bool has_object + c_bool is_in_plasma CObjectID c_object_id = object_ref.native() with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().Contains( - c_object_id, &has_object)) + c_object_id, &has_object, &is_in_plasma)) - return has_object + return has_object and (not memory_store_only or not is_in_plasma) cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index f1acad1fadd8..637dbd750020 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -183,7 +183,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results, c_bool plasma_objects_only) - CRayStatus Contains(const CObjectID &object_id, c_bool *has_object) + CRayStatus Contains(const CObjectID &object_id, c_bool *has_object, + c_bool *is_in_plasma) CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects, int64_t timeout_ms, c_vector[c_bool] *results, c_bool fetch_local) diff --git 
a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py index 6df746fdcd91..8f607009ee49 100644 --- a/python/ray/tests/test_advanced.py +++ b/python/ray/tests/test_advanced.py @@ -521,6 +521,43 @@ def method(self): assert ray.worker.global_worker.core_worker.object_exists(x_id) +@pytest.mark.skipif(client_test_enabled(), reason="internal api") +def test_future_resolution_skip_plasma(ray_start_cluster): + cluster = ray_start_cluster + # Disable worker caching so worker leases are not reused; set object + # inlining size threshold and enable storing of small objects in in-memory + # object store so the borrowed ref is inlined. + cluster.add_node( + num_cpus=1, + resources={"pin_head": 1}, + _system_config={ + "worker_lease_timeout_milliseconds": 0, + "max_direct_call_object_size": 100 * 1024, + "put_small_object_in_memory_store": True, + }, + ) + cluster.add_node(num_cpus=1, resources={"pin_worker": 1}) + ray.init(address=cluster.address) + + @ray.remote(resources={"pin_head": 1}) + def f(x): + return x + 1 + + @ray.remote(resources={"pin_worker": 1}) + def g(x): + borrowed_ref = x[0] + f_ref = f.remote(borrowed_ref) + # borrowed_ref should be inlined on future resolution and shouldn't be + # in Plasma. 
+ assert ray.worker.global_worker.core_worker.object_exists( + borrowed_ref, memory_store_only=True) + return ray.get(f_ref) * 2 + + one = ray.put(1) + g_ref = g.remote([one]) + assert ray.get(g_ref) == 4 + + if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 21fc462a7af6..dfbe8ef2ccd3 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1058,7 +1058,8 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_m return Status::OK(); } -Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { +Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object, + bool *is_in_plasma) { bool found = false; bool in_plasma = false; found = memory_store_->Contains(object_id, &in_plasma); @@ -1066,6 +1067,9 @@ Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { RAY_RETURN_NOT_OK(plasma_store_provider_->Contains(object_id, &found)); } *has_object = found; + if (is_in_plasma != nullptr) { + *is_in_plasma = found && in_plasma; + } return Status::OK(); } @@ -2091,25 +2095,43 @@ void CoreWorker::HandleGetObjectStatus(const rpc::GetObjectStatusRequest &reques send_reply_callback(Status::OK(), nullptr, nullptr); } else { RAY_CHECK(owner_address.worker_id() == request.owner_worker_id()); + bool is_freed = reference_counter_->IsPlasmaObjectFreed(object_id); - if (reference_counter_->IsPlasmaObjectFreed(object_id)) { - reply->set_status(rpc::GetObjectStatusReply::FREED); - } else { - reply->set_status(rpc::GetObjectStatusReply::CREATED); - } // Send the reply once the value has become available. The value is // guaranteed to become available eventually because we own the object and // its ref count is > 0. - // TODO(swang): We could probably just send the object value if it is small - // enough and we have it local. 
- memory_store_->GetAsync(object_id, - [send_reply_callback](std::shared_ptr obj) { - send_reply_callback(Status::OK(), nullptr, nullptr); - }); + memory_store_->GetAsync(object_id, [reply, send_reply_callback, + is_freed](std::shared_ptr obj) { + if (is_freed) { + reply->set_status(rpc::GetObjectStatusReply::FREED); + } else { + // If obj is the concrete object value, it is small, so we + // send the object back to the caller in the GetObjectStatus + // reply, bypassing a Plasma put and object transfer. If obj + // is an indicator that the object is in Plasma, we set an + // in_plasma indicator on the message, and the caller will + // have to facilitate a Plasma object transfer to get the + // object value. + auto *object = reply->mutable_object(); + if (obj->HasData()) { + const auto &data = obj->GetData(); + object->set_data(data->Data(), data->Size()); + } + if (obj->HasMetadata()) { + const auto &metadata = obj->GetMetadata(); + object->set_metadata(metadata->Data(), metadata->Size()); + } + for (const auto &nested_id : obj->GetNestedIds()) { + object->add_nested_inlined_ids(nested_id.Binary()); + } + reply->set_status(rpc::GetObjectStatusReply::CREATED); + } + send_reply_callback(Status::OK(), nullptr, nullptr); + }); } RemoveLocalReference(object_id); -} +} // namespace ray void CoreWorker::HandleWaitForActorOutOfScope( const rpc::WaitForActorOutOfScopeRequest &request, diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 088ba346a70c..3002b9003630 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -559,8 +559,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// /// \param[in] object_id ID of the objects to check for. /// \param[out] has_object Whether or not the object is present. + /// \param[out] is_in_plasma Whether or not the object is in Plasma. /// \return Status. 
- Status Contains(const ObjectID &object_id, bool *has_object); + Status Contains(const ObjectID &object_id, bool *has_object, + bool *is_in_plasma = nullptr); /// Wait for a list of objects to appear in the object store. /// Duplicate object ids are supported, and `num_objects` includes duplicate ids in this diff --git a/src/ray/core_worker/future_resolver.cc b/src/ray/core_worker/future_resolver.cc index 8a1cc3f078ef..c625507cdbb5 100644 --- a/src/ray/core_worker/future_resolver.cc +++ b/src/ray/core_worker/future_resolver.cc @@ -28,30 +28,53 @@ void FutureResolver::ResolveFutureAsync(const ObjectID &object_id, rpc::GetObjectStatusRequest request; request.set_object_id(object_id.Binary()); request.set_owner_worker_id(owner_address.worker_id()); - conn->GetObjectStatus( - request, - [this, object_id](const Status &status, const rpc::GetObjectStatusReply &reply) { - if (!status.ok()) { - RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id - << " that was deserialized: " << status.ToString(); - } + conn->GetObjectStatus(request, [this, object_id]( + const Status &status, + const rpc::GetObjectStatusReply &reply) { + if (!status.ok()) { + RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id + << " that was deserialized: " << status.ToString(); + } - if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { - // The owner is gone or the owner replied that the object has gone - // out of scope (this is an edge case in the distributed ref counting - // protocol where a borrower dies before it can notify the owner of - // another borrower). Store an error so that an exception will be - // thrown immediately when the worker tries to get the value. - RAY_UNUSED(in_memory_store_->Put( - RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); - } else { - // We can now try to fetch the object via plasma. 
If the owner later - // fails or the object is released, the raylet will eventually store - // an error in plasma on our behalf. - RAY_UNUSED(in_memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), - object_id)); - } - }); + if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { + // The owner is gone or the owner replied that the object has gone + // out of scope (this is an edge case in the distributed ref counting + // protocol where a borrower dies before it can notify the owner of + // another borrower). Store an error so that an exception will be + // thrown immediately when the worker tries to get the value. + RAY_UNUSED(in_memory_store_->Put( + RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); + } else if (reply.status() == rpc::GetObjectStatusReply::CREATED) { + // The object is either an indicator that the object is in Plasma, or + // the object has been returned directly in the reply. In either + // case, we put the corresponding RayObject into the in-memory store. + // If the owner later fails or the object is released, the raylet + // will eventually store an error in Plasma on our behalf. 
+ const auto &data = reply.object().data(); + std::shared_ptr data_buffer; + if (data.size() > 0) { + RAY_LOG(DEBUG) << "Object returned directly in GetObjectStatus reply, putting " + << object_id << " in memory store"; + data_buffer = std::make_shared( + const_cast(reinterpret_cast(data.data())), + data.size()); + } else { + RAY_LOG(DEBUG) << "Object not returned directly in GetObjectStatus reply, " + << object_id << " will have to be fetched from Plasma"; + } + const auto &metadata = reply.object().metadata(); + std::shared_ptr metadata_buffer; + if (metadata.size() > 0) { + metadata_buffer = std::make_shared( + const_cast(reinterpret_cast(metadata.data())), + metadata.size()); + } + auto inlined_ids = + IdVectorFromProtobuf(reply.object().nested_inlined_ids()); + RAY_UNUSED(in_memory_store_->Put( + RayObject(data_buffer, metadata_buffer, inlined_ids), object_id)); + } + }); } } // namespace ray diff --git a/src/ray/core_worker/future_resolver.h b/src/ray/core_worker/future_resolver.h index be504a582f3d..b774434b71da 100644 --- a/src/ray/core_worker/future_resolver.h +++ b/src/ray/core_worker/future_resolver.h @@ -16,6 +16,7 @@ #include +#include "ray/common/grpc_util.h" #include "ray/common/id.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/rpc/worker/core_worker_client.h" diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 799530d274e9..43dfaa45bbe0 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -132,6 +132,15 @@ message GetObjectStatusRequest { bytes object_id = 2; } +message RayObject { + // Data of the object. + bytes data = 1; + // Metadata of the object. + bytes metadata = 2; + // ObjectIDs that were nested in data. This is only set for inlined objects. 
+ repeated bytes nested_inlined_ids = 3; +} + message GetObjectStatusReply { enum ObjectStatus { CREATED = 0; @@ -139,6 +148,9 @@ message GetObjectStatusReply { FREED = 2; } ObjectStatus status = 1; + // The Ray object: either a concrete value, an in-Plasma indicator, or an + // exception. + RayObject object = 2; } message WaitForActorOutOfScopeRequest { From 3d87abae3cf8141feda09a565c3953a16eb32ab5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 237/244] Revert "[Kubernetes] Unit test for cluster launch and teardown using K8s Operator (#13437)" This reverts commit 5b080c1262f6c052575656375d3f6b9aa9372629. --- ci/travis/ci.sh | 1 - python/ray/tests/BUILD | 3 +- .../ray/tests/test_k8s_operator_examples.py | 150 ------------------ 3 files changed, 1 insertion(+), 153 deletions(-) delete mode 100644 python/ray/tests/test_k8s_operator_examples.py diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index d9c679bc7218..a403a4a9f522 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -164,7 +164,6 @@ test_python() { -python/ray/tests:test_stress # timeout -python/ray/tests:test_stress_sharded # timeout -python/ray/tests:test_k8s_cluster_launcher - -python/ray/tests:test_k8s_operator_examples ) fi if [ 0 -lt "${#args[@]}" ]; then # Any targets to test? 
diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 7f4c61bb1cfb..0f2709c82fc0 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -111,9 +111,8 @@ py_test_module_list( py_test_module_list( files = [ "test_k8s_cluster_launcher.py", - "test_k8s_operator_examples.py", ], - size = "medium", + size = "small", extra_srcs = SRCS, deps = ["//:ray_lib"], tags = ["kubernetes"] diff --git a/python/ray/tests/test_k8s_operator_examples.py b/python/ray/tests/test_k8s_operator_examples.py deleted file mode 100644 index 6ca2aca370b2..000000000000 --- a/python/ray/tests/test_k8s_operator_examples.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Tests launch and teardown of multiple Ray clusters using Kubernetes -operator.""" -import sys -import os -import subprocess -import tempfile -import time -import unittest - -import kubernetes -import pytest -import yaml - -IMAGE_ENV = "KUBERNETES_OPERATOR_TEST_IMAGE" -IMAGE = os.getenv(IMAGE_ENV, "rayproject/ray:nightly") -NAMESPACE = "test-k8s-operator-examples" - - -def retry_until_true(f): - # Retry 60 times with 1 second delay between attempts. - def f_with_retries(*args, **kwargs): - for _ in range(60): - if f(*args, **kwargs): - return - else: - time.sleep(1) - pytest.fail("The condition wasn't met before the timeout expired.") - - return f_with_retries - - -@retry_until_true -def wait_for_pods(n): - client = kubernetes.client.CoreV1Api() - pods = client.list_namespaced_pod(namespace=NAMESPACE).items - # Double-check that the correct image is use. 
- for pod in pods: - assert pod.spec.containers[0].image == IMAGE - return len(pods) == n - - -@retry_until_true -def wait_for_logs(): - """Check if logs indicate presence of nodes of types "head-node" and - "worker-nodes" in the "example-cluster" cluster.""" - cmd = f"kubectl -n {NAMESPACE} logs ray-operator-pod"\ - "| grep ^example-cluster: | tail -n 100" - log_tail = subprocess.check_output(cmd, shell=True).decode() - return ("head-node" in log_tail) and ("worker-nodes" in log_tail) - - -def operator_configs_directory(): - here = os.path.realpath(__file__) - ray_python_root = os.path.dirname(os.path.dirname(here)) - relative_path = "autoscaler/kubernetes/operator_configs" - return os.path.join(ray_python_root, relative_path) - - -def get_operator_config_path(file_name): - return os.path.join(operator_configs_directory(), file_name) - - -class KubernetesOperatorTest(unittest.TestCase): - def test_examples(self): - with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \ - tempfile.NamedTemporaryFile("w+") as example_cluster2_file,\ - tempfile.NamedTemporaryFile("w+") as operator_file: - - # Get paths to operator configs - example_cluster_config_path = get_operator_config_path( - "example_cluster.yaml") - example_cluster2_config_path = get_operator_config_path( - "example_cluster2.yaml") - operator_config_path = get_operator_config_path("operator.yaml") - self.crd_path = get_operator_config_path("cluster_crd.yaml") - - # Load operator configs - example_cluster_config = yaml.safe_load( - open(example_cluster_config_path).read()) - example_cluster2_config = yaml.safe_load( - open(example_cluster2_config_path).read()) - operator_config = list( - yaml.safe_load_all(open(operator_config_path).read())) - - # Fill image fields - podTypes = example_cluster_config["spec"]["podTypes"] - podTypes2 = example_cluster2_config["spec"]["podTypes"] - pod_configs = ([operator_config[-1]] + [ - podType["podConfig"] for podType in podTypes - ] + [podType["podConfig"] for 
podType in podTypes2]) - for pod_config in pod_configs: - pod_config["spec"]["containers"][0]["image"] = IMAGE - - # Dump to temporary files - yaml.dump(example_cluster_config, example_cluster_file) - yaml.dump(example_cluster2_config, example_cluster2_file) - yaml.dump_all(operator_config, operator_file) - files = [ - example_cluster_file, example_cluster2_file, operator_file - ] - for file in files: - file.flush() - - # Apply CR - cmd = f"kubectl apply -f {self.crd_path}" - subprocess.check_call(cmd, shell=True) - - # Create namespace - cmd = f"kubectl create namespace {NAMESPACE}" - subprocess.check_call(cmd, shell=True) - - # Start operator and two clusters - for file in files: - cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}" - subprocess.check_call(cmd, shell=True) - - # Check that autoscaling respects minWorkers by waiting for - # six pods in the namespace. - wait_for_pods(6) - - # Check that logging output looks normal (two workers connected to - # ray cluster example-cluster.) - wait_for_logs() - - # Delete the second cluster - cmd = f"kubectl -n {NAMESPACE} delete -f"\ - f"{example_cluster2_file.name}" - subprocess.check_call(cmd, shell=True) - - # Four pods remain - wait_for_pods(4) - - # Delete the first cluster - cmd = f"kubectl -n {NAMESPACE} delete -f"\ - f"{example_cluster_file.name}" - subprocess.check_call(cmd, shell=True) - - # Only operator pod remains. 
- wait_for_pods(1) - - def __del__(self): - cmd = f"kubectl delete -f {self.crd_path}" - subprocess.check_call(cmd, shell=True) - cmd = f"kubectl delete namespace {NAMESPACE}" - subprocess.check_call(cmd, shell=True) - - -if __name__ == "__main__": - kubernetes.config.load_kube_config() - sys.exit(pytest.main(["-v", __file__])) From bfcee20a8b6949838961e4c9822708863707cf6f Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 238/244] Revert "[serve] Refactor BackendState to use ReplicaState classes (#13406)" This reverts commit fc379336f9dd10f0411853d5d7131acb458f50a5. --- python/ray/serve/backend_state.py | 533 ++++++++++++----------------- python/ray/serve/config.py | 4 +- python/ray/serve/controller.py | 4 +- python/ray/serve/tests/test_api.py | 3 - 4 files changed, 217 insertions(+), 327 deletions(-) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index 4aad2671ea4e..673c4b2cfbc8 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -1,8 +1,7 @@ import asyncio +from asyncio.futures import Future from collections import defaultdict -from enum import Enum -import time -from typing import Dict, List, Optional, Tuple +from typing import Dict, Any, List, Optional, Set, Tuple import ray import ray.cloudpickle as pickle @@ -18,6 +17,7 @@ ) from ray.serve.config import BackendConfig, ReplicaConfig from ray.serve.constants import LongPollKey +from ray.serve.exceptions import RayServeException from ray.serve.kv_store import RayInternalKVStore from ray.serve.long_poll import LongPollHost from ray.serve.utils import (format_actor_name, get_random_letters, logger, @@ -30,150 +30,6 @@ _RESOURCE_CHECK_ENABLED = True -class ReplicaState(Enum): - SHOULD_START = 1 - STARTING = 2 - RUNNING = 3 - SHOULD_STOP = 4 - STOPPING = 5 - STOPPED = 6 - - -class BackendReplica: - def __init__(self, controller_name: str, detached: bool, - 
replica_tag: ReplicaTag, backend_tag: BackendTag): - self._actor_name = format_actor_name(replica_tag, controller_name) - self._controller_name = controller_name - self._detached = detached - self._replica_tag = replica_tag - self._backend_tag = backend_tag - self._actor_handle = None - self._startup_obj_ref = None - self._drain_obj_ref = None - self._state = ReplicaState.SHOULD_START - - def __get_state__(self): - clean_dict = self.__dict__.copy() - del clean_dict["_actor_handle"] - del clean_dict["_startup_obj_ref"] - del clean_dict["_drain_obj_ref"] - return clean_dict - - def __set_state__(self, d): - self.__dict__ = d - self._actor_handle = None - self._startup_obj_ref = None - self._drain_obj_ref = None - self._recover_from_checkpoint() - - def _recover_from_checkpoint(self): - if self._state == ReplicaState.STARTING: - # We do not need to pass in the class here because the actor - # creation has already been started if this class was checkpointed - # in the STARTING state. - self.start() - elif self._state == ReplicaState.RUNNING: - # Fetch actor handles for all backend replicas in the system. - # The actors must exist if this class was checkpointed in the - # RUNNING state. 
- self._actor_handle = ray.get_actor(self._actor_name) - elif self._state == ReplicaState.STOPPING: - self.stop() - - def start(self, backend_info: Optional[BackendInfo]): - assert self._state in { - ReplicaState.SHOULD_START, ReplicaState.STARTING - }, (f"State must be {ReplicaState.SHOULD_START} or " - f"{ReplicaState.STARTING}, *not* {self._state}") - try: - self._actor_handle = ray.get_actor(self._actor_name) - except ValueError: - logger.debug("Starting replica '{}' for backend '{}'.".format( - self._replica_tag, self._backend_tag)) - self._actor_handle = ray.remote(backend_info.worker_class).options( - name=self._actor_name, - lifetime="detached" if self._detached else None, - max_restarts=-1, - max_task_retries=-1, - **backend_info.replica_config.ray_actor_options).remote( - self._backend_tag, self._replica_tag, - backend_info.replica_config.actor_init_args, - backend_info.backend_config, self._controller_name) - self._startup_obj_ref = self._actor_handle.ready.remote() - self._state = ReplicaState.STARTING - - def check_started(self): - if self._state == ReplicaState.RUNNING: - return True - assert self._state == ReplicaState.STARTING, ( - f"State must be {ReplicaState.STARTING}, *not* {self._state}") - ready, _ = ray.wait([self._startup_obj_ref], timeout=0) - if len(ready) == 1: - self._state = ReplicaState.RUNNING - return True - return False - - def set_should_stop(self, graceful_shutdown_timeout_s: Duration): - self._state = ReplicaState.SHOULD_STOP - self._graceful_shutdown_timeout_s = graceful_shutdown_timeout_s - - def stop(self): - # We need to handle transitions from: - # SHOULD_START -> SHOULD_STOP -> STOPPING - # This means that the replica_handle may not have been created. 
- - assert self._state in { - ReplicaState.SHOULD_STOP, ReplicaState.STOPPING - }, (f"State must be {ReplicaState.SHOULD_STOP} or " - f"{ReplicaState.STOPPING}, *not* {self._state}") - - def drain_actor(actor_name): - # NOTE: the replicas may already be stopped if we failed - # after stopping them but before writing a checkpoint. - try: - replica = ray.get_actor(actor_name) - except ValueError: - return None - return replica.drain_pending_queries.remote() - - self._state = ReplicaState.STOPPING - self._drain_obj_ref = drain_actor(self._actor_name) - self._shutdown_deadline = time.time( - ) + self._graceful_shutdown_timeout_s - - def check_stopped(self): - if self._state == ReplicaState.STOPPED: - return True - assert self._state == ReplicaState.STOPPING, ( - f"State must be {ReplicaState.STOPPING}, *not* {self._state}") - - try: - replica = ray.get_actor(self._actor_name) - except ValueError: - self._state = ReplicaState.STOPPED - return True - - ready, _ = ray.wait([self._drain_obj_ref], timeout=0) - timeout_passed = time.time() > self._shutdown_deadline - - if len(ready) == 1 or timeout_passed: - if timeout_passed: - # Graceful period passed, kill it forcefully. - logger.debug( - f"{self._actor_name} did not shutdown after " - f"{self._graceful_shutdown_timeout_s}s, force-killing.") - - ray.kill(replica, no_restart=True) - self._state = ReplicaState.STOPPED - return True - return False - - def get_actor_handle(self): - assert self._state == ReplicaState.RUNNING, ( - f"State must be {ReplicaState.RUNNING}, *not* {self._state}") - return self._actor_handle - - class BackendState: """Manages all state for backends in the system. 
@@ -190,65 +46,79 @@ def __init__(self, controller_name: str, detached: bool, self._long_poll_host = long_poll_host self._goal_manager = goal_manager - self._replicas: Dict[BackendTag, Dict[ReplicaState, List[ - BackendReplica]]] = defaultdict(lambda: defaultdict(list)) - self._backend_metadata: Dict[BackendTag, BackendInfo] = dict() - self._target_replicas: Dict[BackendTag, int] = defaultdict(int) - self.backend_goals: Dict[BackendTag, GoalId] = dict() + # Non-checkpointed state. + self.currently_starting_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag, ActorHandle]] = dict() + self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag]] = dict() - # Un-Checkpointed state. - self.pending_goals: Dict[GoalId, asyncio.Event] = dict() + # Checkpointed state. + self.backends: Dict[BackendTag, BackendInfo] = dict() + self.backend_replicas: Dict[BackendTag, Dict[ + ReplicaTag, ActorHandle]] = defaultdict(dict) + self.backend_goals: Dict[BackendTag, GoalId] = dict() + self.backend_replicas_to_start: Dict[BackendTag, List[ + ReplicaTag]] = defaultdict(list) + self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[ + ReplicaTag, Duration]]] = defaultdict(list) + self.backends_to_remove: List[BackendTag] = list() checkpoint = self._kv_store.get(CHECKPOINT_KEY) if checkpoint is not None: - (self._replicas, self._backend_metadata, self._target_replicas, - self.backend_goals, pending_goal_ids) = pickle.loads(checkpoint) + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backend_to_remove, + pending_goal_ids) = pickle.loads(checkpoint) for goal_id in pending_goal_ids: self._goal_manager.create_goal(goal_id) + # Fetch actor handles for all backend replicas in the system. + # All of these backend_replicas are guaranteed to already exist + # because they would not be written to a checkpoint in + # self.backend_replicas until they were created. 
+ for backend_tag, replica_dict in self.backend_replicas.items(): + for replica_tag in replica_dict.keys(): + replica_name = format_actor_name(replica_tag, + self._controller_name) + self.backend_replicas[backend_tag][ + replica_tag] = ray.get_actor(replica_name) + self._notify_backend_configs_changed() self._notify_replica_handles_changed() def _checkpoint(self) -> None: self._kv_store.put( CHECKPOINT_KEY, - pickle.dumps((self._replicas, self._backend_metadata, - self._target_replicas, self.backend_goals, - self._goal_manager.get_pending_goal_ids()))) + pickle.dumps( + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backends_to_remove, + self._goal_manager.get_pending_goal_ids()))) def _notify_backend_configs_changed(self) -> None: self._long_poll_host.notify_changed(LongPollKey.BACKEND_CONFIGS, self.get_backend_configs()) - def get_running_replica_handles( - self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: - return { - backend_tag: { - backend_replica._replica_tag: - backend_replica.get_actor_handle() - for backend_replica in state_to_replica_dict[ - ReplicaState.RUNNING] - } - for backend_tag, state_to_replica_dict in self._replicas.items() - } - def _notify_replica_handles_changed(self) -> None: self._long_poll_host.notify_changed( LongPollKey.REPLICA_HANDLES, { backend_tag: list(replica_dict.values()) - for backend_tag, replica_dict in - self.get_running_replica_handles().items() + for backend_tag, replica_dict in self.backend_replicas.items() }) def get_backend_configs(self) -> Dict[BackendTag, BackendConfig]: return { tag: info.backend_config - for tag, info in self._backend_metadata.items() + for tag, info in self.backends.items() } + def get_replica_handles( + self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: + return self.backend_replicas + def get_backend(self, backend_tag: BackendTag) -> Optional[BackendInfo]: - return 
self._backend_metadata.get(backend_tag) + return self.backends.get(backend_tag) def _set_backend_goal(self, backend_tag: BackendTag, backend_info: BackendInfo) -> None: @@ -256,11 +126,7 @@ def _set_backend_goal(self, backend_tag: BackendTag, new_goal_id = self._goal_manager.create_goal() if backend_info is not None: - self._backend_metadata[backend_tag] = backend_info - self._target_replicas[ - backend_tag] = backend_info.backend_config.num_replicas - else: - self._target_replicas[backend_tag] = 0 + self.backends[backend_tag] = backend_info self.backend_goals[backend_tag] = new_goal_id @@ -270,25 +136,31 @@ def create_backend(self, backend_tag: BackendTag, backend_config: BackendConfig, replica_config: ReplicaConfig) -> Optional[GoalId]: # Ensures this method is idempotent. - backend_info = self._backend_metadata.get(backend_tag) + backend_info = self.backends.get(backend_tag) if backend_info is not None: if (backend_info.backend_config == backend_config and backend_info.replica_config == replica_config): return None - backend_replica_class = create_backend_replica( - replica_config.func_or_class) + backend_replica = create_backend_replica(replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. backend_info = BackendInfo( - worker_class=backend_replica_class, + worker_class=backend_replica, backend_config=backend_config, replica_config=replica_config) new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, backend_info) + try: + self.scale_backend_replicas(backend_tag, + backend_config.num_replicas) + except RayServeException as e: + del self.backends[backend_tag] + raise e + # NOTE(edoakes): we must write a checkpoint before starting new # or pushing the updated config to avoid inconsistent state if we # crash while making the change. 
@@ -303,15 +175,20 @@ def delete_backend(self, backend_tag: BackendTag, force_kill: bool = False) -> Optional[GoalId]: # This method must be idempotent. We should validate that the # specified backend exists on the client. - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: return None + # Scale its replicas down to 0. + self.scale_backend_replicas(backend_tag, 0, force_kill) + + # Remove the backend's metadata. + del self.backends[backend_tag] + + # Add the intention to remove the backend from the routers. + self.backends_to_remove.append(backend_tag) + new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, None) - if force_kill: - self._backend_metadata[ - backend_tag].backend_config.\ - experimental_graceful_shutdown_timeout_s = 0 self._checkpoint() if existing_goal_id is not None: @@ -320,18 +197,20 @@ def delete_backend(self, backend_tag: BackendTag, def update_backend_config(self, backend_tag: BackendTag, config_options: BackendConfig): - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: raise ValueError(f"Backend {backend_tag} is not registered") - stored_backend_config = self._backend_metadata[ - backend_tag].backend_config + stored_backend_config = self.backends[backend_tag].backend_config updated_config = stored_backend_config.copy( update=config_options.dict(exclude_unset=True)) updated_config._validate_complete() - self._backend_metadata[backend_tag].backend_config = updated_config + self.backends[backend_tag].backend_config = updated_config new_goal_id, existing_goal_id = self._set_backend_goal( - backend_tag, self._backend_metadata[backend_tag]) + backend_tag, self.backends[backend_tag]) + + # Scale the replicas with the new configuration. 
+ self.scale_backend_replicas(backend_tag, updated_config.num_replicas) # NOTE(edoakes): we must write a checkpoint before pushing the # update to avoid inconsistent state if we crash after pushing the @@ -381,38 +260,31 @@ def _start_backend_replica(self, backend_tag: BackendTag, def scale_backend_replicas( self, backend_tag: BackendTag, - ) -> bool: + num_replicas: int, + force_kill: bool = False, + ) -> None: """Scale the given backend to the number of replicas. NOTE: this does not actually start or stop the replicas, but instead - adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP. - The caller is responsible for then first writing a checkpoint and then - actually starting/stopping the intended replicas. This avoids - inconsistencies with starting/stopping a replica and then crashing - before writing a checkpoint. + adds the intention to start/stop them to self.backend_replicas_to_start + and self.backend_replicas_to_stop. The caller is responsible for then + first writing a checkpoint and then actually starting/stopping the + intended replicas. This avoids inconsistencies with starting/stopping a + replica and then crashing before writing a checkpoint. 
""" - num_replicas = self._target_replicas.get(backend_tag, 0) logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) - assert (backend_tag in self._backend_metadata + assert (backend_tag in self.backends ), "Backend {} is not registered.".format(backend_tag) assert num_replicas >= 0, ("Number of replicas must be" " greater than or equal to 0.") - current_num_replicas = sum([ - len(self._replicas[backend_tag][ReplicaState.SHOULD_START]), - len(self._replicas[backend_tag][ReplicaState.STARTING]), - len(self._replicas[backend_tag][ReplicaState.RUNNING]), - ]) - + current_num_replicas = len(self.backend_replicas[backend_tag]) delta_num_replicas = num_replicas - current_num_replicas - backend_info: BackendInfo = self._backend_metadata[backend_tag] - if delta_num_replicas == 0: - return False - - elif delta_num_replicas > 0: + backend_info: BackendInfo = self.backends[backend_tag] + if delta_num_replicas > 0: can_schedule = try_schedule_resources_on_nodes(requirements=[ backend_info.replica_config.resource_dict for _ in range(delta_num_replicas) @@ -420,11 +292,10 @@ def scale_backend_replicas( if _RESOURCE_CHECK_ENABLED and not all(can_schedule): num_possible = sum(can_schedule) - logger.error( + raise RayServeException( "Cannot scale backend {} to {} replicas. Ray Serve tried " "to add {} replicas but the resources only allows {} " - "to be added. This is not a problem if the cluster is " - "autoscaling. To fix this, consider scaling to replica to " + "to be added. To fix this, consider scaling to replica to " "{} or add more resources to the cluster. 
You can check " "avaiable resources with ray.nodes().".format( backend_tag, num_replicas, delta_num_replicas, @@ -434,132 +305,154 @@ def scale_backend_replicas( delta_num_replicas, backend_tag)) for _ in range(delta_num_replicas): replica_tag = "{}#{}".format(backend_tag, get_random_letters()) - self._replicas[backend_tag][ReplicaState.SHOULD_START].append( - BackendReplica(self._controller_name, self._detached, - replica_tag, backend_tag)) + self.backend_replicas_to_start[backend_tag].append(replica_tag) elif delta_num_replicas < 0: logger.debug("Removing {} replicas from backend '{}'".format( -delta_num_replicas, backend_tag)) - assert self._target_replicas[backend_tag] >= delta_num_replicas - + assert len( + self.backend_replicas[backend_tag]) >= delta_num_replicas + replicas_copy = self.backend_replicas.copy() for _ in range(-delta_num_replicas): - replica_state_dict = self._replicas[backend_tag] - list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \ - or replica_state_dict[ReplicaState.STARTING] \ - or replica_state_dict[ReplicaState.RUNNING] - - assert len(list_to_use), replica_state_dict - replica_to_stop = list_to_use.pop() + replica_tag, _ = replicas_copy[backend_tag].popitem() graceful_timeout_s = (backend_info.backend_config. 
experimental_graceful_shutdown_timeout_s) - - replica_to_stop.set_should_stop(graceful_timeout_s) - self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append( - replica_to_stop) - - return True - - def scale_all_backends(self): - checkpoint_needed = False - for backend_tag, num_replicas in list(self._target_replicas.items()): - checkpoint_needed = (checkpoint_needed - or self.scale_backend_replicas(backend_tag)) - if num_replicas == 0: - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if checkpoint_needed: - self._checkpoint() - - def _pop_replicas_of_state(self, state: ReplicaState - ) -> List[Tuple[ReplicaState, BackendTag]]: - replicas = [] - for backend_tag, state_to_replica_dict in self._replicas.items(): - if state in state_to_replica_dict: - replicas.extend( - (replica, backend_tag) - for replica in state_to_replica_dict.pop(state)) - - return replicas + if force_kill: + graceful_timeout_s = 0 + self.backend_replicas_to_stop[backend_tag].append(( + replica_tag, + graceful_timeout_s, + )) + + def _start_pending_replicas(self): + for backend_tag, replicas_to_create in self.backend_replicas_to_start.\ + items(): + for replica_tag in replicas_to_create: + replica_handle = self._start_backend_replica( + backend_tag, replica_tag) + ready_future = replica_handle.ready.remote().as_future() + self.currently_starting_replicas[ready_future] = ( + backend_tag, replica_tag, replica_handle) + + def _stop_pending_replicas(self): + for backend_tag, replicas_to_stop in ( + self.backend_replicas_to_stop.items()): + for replica_tag, shutdown_timeout in replicas_to_stop: + replica_name = format_actor_name(replica_tag, + self._controller_name) + + async def kill_actor(replica_name_to_use): + # NOTE: the replicas may already be stopped if we failed + # after stopping them but before writing a checkpoint. 
+ try: + replica = ray.get_actor(replica_name_to_use) + except ValueError: + return + + try: + await asyncio.wait_for( + replica.drain_pending_queries.remote(), + timeout=shutdown_timeout) + except asyncio.TimeoutError: + # Graceful period passed, kill it forcefully. + logger.debug( + f"{replica_name_to_use} did not shutdown after " + f"{shutdown_timeout}s, killing.") + finally: + ray.kill(replica, no_restart=True) + + self.currently_stopping_replicas[asyncio.ensure_future( + kill_actor(replica_name))] = (backend_tag, replica_tag) + + async def _check_currently_starting_replicas(self) -> int: + """Returns the number of pending replicas waiting to start""" + in_flight: Set[Future[Any]] = set() + + if self.currently_starting_replicas: + done, in_flight = await asyncio.wait( + list(self.currently_starting_replicas.keys()), timeout=0) + for fut in done: + (backend_tag, replica_tag, + replica_handle) = self.currently_starting_replicas.pop(fut) + self.backend_replicas[backend_tag][ + replica_tag] = replica_handle + + backend = self.backend_replicas_to_start.get(backend_tag) + if backend: + try: + backend.remove(replica_tag) + except ValueError: + pass + if len(backend) == 0: + del self.backend_replicas_to_start[backend_tag] + + async def _check_currently_stopping_replicas(self) -> int: + """Returns the number of replicas waiting to stop""" + in_flight: Set[Future[Any]] = set() + + if self.currently_stopping_replicas: + done_stopping, in_flight = await asyncio.wait( + list(self.currently_stopping_replicas.keys()), timeout=0) + for fut in done_stopping: + (backend_tag, + replica_tag) = self.currently_stopping_replicas.pop(fut) + + backend_to_stop = self.backend_replicas_to_stop.get( + backend_tag) + + if backend_to_stop: + try: + backend_to_stop.remove(replica_tag) + except ValueError: + pass + if len(backend_to_stop) == 0: + del self.backend_replicas_to_stop[backend_tag] + + backend = self.backend_replicas.get(backend_tag) + if backend: + try: + del backend[replica_tag] + 
except KeyError: + pass + + if len(self.backend_replicas[backend_tag]) == 0: + del self.backend_replicas[backend_tag] def _completed_goals(self) -> List[GoalId]: completed_goals = [] - all_tags = set(self._replicas.keys()).union( - set(self._backend_metadata.keys())) + all_tags = set(self.backend_replicas.keys()).union( + set(self.backends.keys())) for backend_tag in all_tags: - desired_num_replicas = self._target_replicas.get(backend_tag) - state_dict = self._replicas.get(backend_tag, {}) - existing_info = state_dict.get(ReplicaState.RUNNING, []) - - # If we have pending ops, the current goal is *not* ready - if (state_dict.get(ReplicaState.SHOULD_START) - or state_dict.get(ReplicaState.STARTING) - or state_dict.get(ReplicaState.SHOULD_STOP) - or state_dict.get(ReplicaState.STOPPING)): - continue - - # TODO(ilr): FIX + desired_info = self.backends.get(backend_tag) + existing_info = self.backend_replicas.get(backend_tag) # Check for deleting - if (not desired_num_replicas or - desired_num_replicas == 0) and \ + if (not desired_info or + desired_info.backend_config.num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) + completed_goals.append(self.backend_goals.get(backend_tag)) # Check for a non-zero number of backends - if (desired_num_replicas and existing_info) \ - and desired_num_replicas == len(existing_info): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) + if desired_info and existing_info and desired_info.backend_config.\ + num_replicas == len(existing_info): + completed_goals.append(self.backend_goals.get(backend_tag)) return [goal for goal in completed_goals if goal] async def update(self) -> bool: - self.scale_all_backends() - for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_START): - 
replica_state.start(self._backend_metadata[backend_tag]) - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_STOP): - replica_state.stop() - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - transition_triggered = False - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STARTING): - if replica_state.check_started(): - self._replicas[backend_tag][ReplicaState.RUNNING].append( - replica_state) - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STOPPING): - if replica_state.check_stopped(): - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - for backend_tag in list(self._replicas.keys()): - if not any(self._replicas[backend_tag]): - del self._replicas[backend_tag] - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if transition_triggered: + self._start_pending_replicas() + self._stop_pending_replicas() + + num_starting = len(self.currently_starting_replicas) + num_stopping = len(self.currently_stopping_replicas) + + await self._check_currently_starting_replicas() + await self._check_currently_stopping_replicas() + + if (len(self.currently_starting_replicas) != num_starting) or \ + (len(self.currently_stopping_replicas) != num_stopping): self._checkpoint() self._notify_replica_handles_changed() diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 41a1eca08ae8..205af81b065a 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator +from pydantic import 
BaseModel, PositiveFloat, PositiveInt, validator from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT) @@ -64,7 +64,7 @@ class BackendConfig(BaseModel): user_config: Any = None experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0 - experimental_graceful_shutdown_timeout_s: confloat(ge=0) = 20.0 + experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0 class Config: validate_assignment = True diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index b5c65111a8f9..a3c75c711878 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -118,7 +118,7 @@ async def run_control_loop(self) -> None: def _all_replica_handles( self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: """Used for testing.""" - return self.backend_state.get_running_replica_handles() + return self.backend_state.get_replica_handles() def get_all_backends(self) -> Dict[BackendTag, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" @@ -235,7 +235,7 @@ async def shutdown(self) -> None: async with self.write_lock: for proxy in self.http_state.get_http_proxy_handles().values(): ray.kill(proxy, no_restart=True) - for replica_dict in self.backend_state.get_running_replica_handles( + for replica_dict in self.backend_state.get_replica_handles( ).values(): for replica in replica_dict.values(): ray.kill(replica, no_restart=True) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index a35f7e54b361..202b01386059 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -683,9 +683,6 @@ def f(): client.create_endpoint("endpoint", backend="backend") -# This error is only printed because creation is run in the control loop, not -# in the API path. 
-@pytest.mark.skip() def test_create_infeasible_error(serve_instance): client = serve_instance From 5eb3cedb92b6a619903917cc0a89eaf6ada37097 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 239/244] Revert "Inline small objects in GetObjectStatus response. (#13309)" This reverts commit b03c51f6e7ba109d9ccef3a05f9fa6486b151827. --- python/ray/_raylet.pyx | 7 ++- python/ray/includes/libcoreworker.pxd | 3 +- python/ray/tests/test_advanced.py | 37 -------------- src/ray/core_worker/core_worker.cc | 48 +++++------------- src/ray/core_worker/core_worker.h | 4 +- src/ray/core_worker/future_resolver.cc | 69 +++++++++----------------- src/ray/core_worker/future_resolver.h | 1 - src/ray/protobuf/core_worker.proto | 12 ----- 8 files changed, 41 insertions(+), 140 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 4b5f9deeef1a..8ba80852fb40 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -898,17 +898,16 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) - def object_exists(self, ObjectRef object_ref, memory_store_only=False): + def object_exists(self, ObjectRef object_ref): cdef: c_bool has_object - c_bool is_in_plasma CObjectID c_object_id = object_ref.native() with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().Contains( - c_object_id, &has_object, &is_in_plasma)) + c_object_id, &has_object)) - return has_object and (not memory_store_only or not is_in_plasma) + return has_object cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 637dbd750020..f1acad1fadd8 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -183,8 +183,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus Get(const c_vector[CObjectID] 
&ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results, c_bool plasma_objects_only) - CRayStatus Contains(const CObjectID &object_id, c_bool *has_object, - c_bool *is_in_plasma) + CRayStatus Contains(const CObjectID &object_id, c_bool *has_object) CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects, int64_t timeout_ms, c_vector[c_bool] *results, c_bool fetch_local) diff --git a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py index 8f607009ee49..6df746fdcd91 100644 --- a/python/ray/tests/test_advanced.py +++ b/python/ray/tests/test_advanced.py @@ -521,43 +521,6 @@ def method(self): assert ray.worker.global_worker.core_worker.object_exists(x_id) -@pytest.mark.skipif(client_test_enabled(), reason="internal api") -def test_future_resolution_skip_plasma(ray_start_cluster): - cluster = ray_start_cluster - # Disable worker caching so worker leases are not reused; set object - # inlining size threshold and enable storing of small objects in in-memory - # object store so the borrowed ref is inlined. - cluster.add_node( - num_cpus=1, - resources={"pin_head": 1}, - _system_config={ - "worker_lease_timeout_milliseconds": 0, - "max_direct_call_object_size": 100 * 1024, - "put_small_object_in_memory_store": True, - }, - ) - cluster.add_node(num_cpus=1, resources={"pin_worker": 1}) - ray.init(address=cluster.address) - - @ray.remote(resources={"pin_head": 1}) - def f(x): - return x + 1 - - @ray.remote(resources={"pin_worker": 1}) - def g(x): - borrowed_ref = x[0] - f_ref = f.remote(borrowed_ref) - # borrowed_ref should be inlined on future resolution and shouldn't be - # in Plasma. 
- assert ray.worker.global_worker.core_worker.object_exists( - borrowed_ref, memory_store_only=True) - return ray.get(f_ref) * 2 - - one = ray.put(1) - g_ref = g.remote([one]) - assert ray.get(g_ref) == 4 - - if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index dfbe8ef2ccd3..21fc462a7af6 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1058,8 +1058,7 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_m return Status::OK(); } -Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object, - bool *is_in_plasma) { +Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { bool found = false; bool in_plasma = false; found = memory_store_->Contains(object_id, &in_plasma); @@ -1067,9 +1066,6 @@ Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object, RAY_RETURN_NOT_OK(plasma_store_provider_->Contains(object_id, &found)); } *has_object = found; - if (is_in_plasma != nullptr) { - *is_in_plasma = found && in_plasma; - } return Status::OK(); } @@ -2095,43 +2091,25 @@ void CoreWorker::HandleGetObjectStatus(const rpc::GetObjectStatusRequest &reques send_reply_callback(Status::OK(), nullptr, nullptr); } else { RAY_CHECK(owner_address.worker_id() == request.owner_worker_id()); - bool is_freed = reference_counter_->IsPlasmaObjectFreed(object_id); + if (reference_counter_->IsPlasmaObjectFreed(object_id)) { + reply->set_status(rpc::GetObjectStatusReply::FREED); + } else { + reply->set_status(rpc::GetObjectStatusReply::CREATED); + } // Send the reply once the value has become available. The value is // guaranteed to become available eventually because we own the object and // its ref count is > 0. 
- memory_store_->GetAsync(object_id, [reply, send_reply_callback, - is_freed](std::shared_ptr obj) { - if (is_freed) { - reply->set_status(rpc::GetObjectStatusReply::FREED); - } else { - // If obj is the concrete object value, it is small, so we - // send the object back to the caller in the GetObjectStatus - // reply, bypassing a Plasma put and object transfer. If obj - // is an indicator that the object is in Plasma, we set an - // in_plasma indicator on the message, and the caller will - // have to facilitate a Plasma object transfer to get the - // object value. - auto *object = reply->mutable_object(); - if (obj->HasData()) { - const auto &data = obj->GetData(); - object->set_data(data->Data(), data->Size()); - } - if (obj->HasMetadata()) { - const auto &metadata = obj->GetMetadata(); - object->set_metadata(metadata->Data(), metadata->Size()); - } - for (const auto &nested_id : obj->GetNestedIds()) { - object->add_nested_inlined_ids(nested_id.Binary()); - } - reply->set_status(rpc::GetObjectStatusReply::CREATED); - } - send_reply_callback(Status::OK(), nullptr, nullptr); - }); + // TODO(swang): We could probably just send the object value if it is small + // enough and we have it local. + memory_store_->GetAsync(object_id, + [send_reply_callback](std::shared_ptr obj) { + send_reply_callback(Status::OK(), nullptr, nullptr); + }); } RemoveLocalReference(object_id); -} // namespace ray +} void CoreWorker::HandleWaitForActorOutOfScope( const rpc::WaitForActorOutOfScopeRequest &request, diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 3002b9003630..088ba346a70c 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -559,10 +559,8 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// /// \param[in] object_id ID of the objects to check for. /// \param[out] has_object Whether or not the object is present. - /// \param[out] is_in_plasma Whether or not the object is in Plasma. 
/// \return Status. - Status Contains(const ObjectID &object_id, bool *has_object, - bool *is_in_plasma = nullptr); + Status Contains(const ObjectID &object_id, bool *has_object); /// Wait for a list of objects to appear in the object store. /// Duplicate object ids are supported, and `num_objects` includes duplicate ids in this diff --git a/src/ray/core_worker/future_resolver.cc b/src/ray/core_worker/future_resolver.cc index c625507cdbb5..8a1cc3f078ef 100644 --- a/src/ray/core_worker/future_resolver.cc +++ b/src/ray/core_worker/future_resolver.cc @@ -28,53 +28,30 @@ void FutureResolver::ResolveFutureAsync(const ObjectID &object_id, rpc::GetObjectStatusRequest request; request.set_object_id(object_id.Binary()); request.set_owner_worker_id(owner_address.worker_id()); - conn->GetObjectStatus(request, [this, object_id]( - const Status &status, - const rpc::GetObjectStatusReply &reply) { - if (!status.ok()) { - RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id - << " that was deserialized: " << status.ToString(); - } + conn->GetObjectStatus( + request, + [this, object_id](const Status &status, const rpc::GetObjectStatusReply &reply) { + if (!status.ok()) { + RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id + << " that was deserialized: " << status.ToString(); + } - if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { - // The owner is gone or the owner replied that the object has gone - // out of scope (this is an edge case in the distributed ref counting - // protocol where a borrower dies before it can notify the owner of - // another borrower). Store an error so that an exception will be - // thrown immediately when the worker tries to get the value. 
- RAY_UNUSED(in_memory_store_->Put( - RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); - } else if (reply.status() == rpc::GetObjectStatusReply::CREATED) { - // The object is either an indicator that the object is in Plasma, or - // the object has been returned directly in the reply. In either - // case, we put the corresponding RayObject into the in-memory store. - // If the owner later fails or the object is released, the raylet - // will eventually store an error in Plasma on our behalf. - const auto &data = reply.object().data(); - std::shared_ptr data_buffer; - if (data.size() > 0) { - RAY_LOG(DEBUG) << "Object returned directly in GetObjectStatus reply, putting " - << object_id << " in memory store"; - data_buffer = std::make_shared( - const_cast(reinterpret_cast(data.data())), - data.size()); - } else { - RAY_LOG(DEBUG) << "Object not returned directly in GetObjectStatus reply, " - << object_id << " will have to be fetched from Plasma"; - } - const auto &metadata = reply.object().metadata(); - std::shared_ptr metadata_buffer; - if (metadata.size() > 0) { - metadata_buffer = std::make_shared( - const_cast(reinterpret_cast(metadata.data())), - metadata.size()); - } - auto inlined_ids = - IdVectorFromProtobuf(reply.object().nested_inlined_ids()); - RAY_UNUSED(in_memory_store_->Put( - RayObject(data_buffer, metadata_buffer, inlined_ids), object_id)); - } - }); + if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { + // The owner is gone or the owner replied that the object has gone + // out of scope (this is an edge case in the distributed ref counting + // protocol where a borrower dies before it can notify the owner of + // another borrower). Store an error so that an exception will be + // thrown immediately when the worker tries to get the value. + RAY_UNUSED(in_memory_store_->Put( + RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); + } else { + // We can now try to fetch the object via plasma. 
If the owner later + // fails or the object is released, the raylet will eventually store + // an error in plasma on our behalf. + RAY_UNUSED(in_memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), + object_id)); + } + }); } } // namespace ray diff --git a/src/ray/core_worker/future_resolver.h b/src/ray/core_worker/future_resolver.h index b774434b71da..be504a582f3d 100644 --- a/src/ray/core_worker/future_resolver.h +++ b/src/ray/core_worker/future_resolver.h @@ -16,7 +16,6 @@ #include -#include "ray/common/grpc_util.h" #include "ray/common/id.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/rpc/worker/core_worker_client.h" diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 43dfaa45bbe0..799530d274e9 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -132,15 +132,6 @@ message GetObjectStatusRequest { bytes object_id = 2; } -message RayObject { - // Data of the object. - bytes data = 1; - // Metadata of the object. - bytes metadata = 2; - // ObjectIDs that were nested in data. This is only set for inlined objects. - repeated bytes nested_inlined_ids = 3; -} - message GetObjectStatusReply { enum ObjectStatus { CREATED = 0; @@ -148,9 +139,6 @@ message GetObjectStatusReply { FREED = 2; } ObjectStatus status = 1; - // The Ray object: either a concrete value, an in-Plasma indicator, or an - // exception. - RayObject object = 2; } message WaitForActorOutOfScopeRequest { From 7d9f4c01b79b9e91b38a8003cbed1c5ef83dbfc5 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 240/244] Revert "[Java] Fix return of java doc (#13601)" This reverts commit 75a573d96e6f67d53adf17435a81629eb96c91d8. 
--- java/api/src/main/java/io/ray/api/Ray.java | 50 ++++++++----------- .../java/io/ray/api/call/ActorCreator.java | 5 +- .../java/io/ray/api/call/ActorTaskCaller.java | 2 +- .../io/ray/api/call/BaseActorCreator.java | 21 +++----- .../java/io/ray/api/call/BaseTaskCaller.java | 9 ++-- .../java/io/ray/api/call/PyActorCreator.java | 2 +- .../io/ray/api/call/PyActorTaskCaller.java | 2 +- .../java/io/ray/api/call/PyTaskCaller.java | 2 +- .../main/java/io/ray/api/call/TaskCaller.java | 2 +- .../io/ray/api/function/PyActorClass.java | 3 +- .../io/ray/api/function/PyActorMethod.java | 6 +-- .../java/io/ray/api/function/PyFunction.java | 6 +-- .../src/main/java/io/ray/api/id/BaseId.java | 2 +- .../ray/api/options/ActorCreationOptions.java | 25 ++++------ .../java/io/ray/api/options/CallOptions.java | 9 ++-- .../java/io/ray/api/runtime/RayRuntime.java | 50 +++++++------------ .../api/runtimecontext/RuntimeContext.java | 2 +- .../ray/runtime/actor/NativeActorHandle.java | 4 +- .../functionmanager/FunctionManager.java | 6 +-- .../java/io/ray/runtime/gcs/GcsClient.java | 5 +- .../java/io/ray/runtime/gcs/RedisClient.java | 2 +- .../java/io/ray/runtime/metric/Metric.java | 2 +- .../java/io/ray/runtime/metric/Metrics.java | 2 +- .../ray/runtime/object/ObjectSerializer.java | 6 +-- .../io/ray/runtime/object/ObjectStore.java | 24 ++++----- .../placementgroup/PlacementGroupImpl.java | 19 +++---- .../placementgroup/PlacementGroupUtils.java | 8 +-- .../io/ray/runtime/task/TaskSubmitter.java | 16 +++--- .../io/ray/runtime/util/BinaryFileUtil.java | 3 +- .../main/java/io/ray/runtime/util/IdUtil.java | 2 +- .../io/ray/runtime/util/ResourceUtil.java | 9 ++-- .../ray/streaming/api/function/Function.java | 2 +- .../api/function/impl/FilterFunction.java | 4 +- .../streaming/api/partition/Partition.java | 4 +- .../ray/streaming/api/stream/DataStream.java | 26 ++++------ .../api/stream/DataStreamSource.java | 3 +- .../streaming/api/stream/KeyDataStream.java | 6 +-- 
.../io/ray/streaming/jobgraph/JobGraph.java | 2 +- .../python/stream/PythonDataStream.java | 28 ++++------- .../python/stream/PythonKeyDataStream.java | 3 +- .../runtime/config/global/CommonConfig.java | 4 +- .../config/master/SchedulerConfig.java | 4 +- .../runtime/context/ContextBackend.java | 5 +- .../graph/executiongraph/ExecutionGraph.java | 30 +++++------ .../executiongraph/ExecutionJobVertex.java | 2 +- .../runtime/core/resource/Resources.java | 2 +- .../streaming/runtime/master/JobMaster.java | 5 +- .../master/graphmanager/GraphManager.java | 7 ++- .../resourcemanager/ResourceManager.java | 2 +- .../strategy/ResourceAssignStrategy.java | 3 +- .../strategy/impl/PipelineFirstStrategy.java | 16 +++--- .../master/scheduler/JobScheduler.java | 3 +- .../master/scheduler/JobSchedulerImpl.java | 6 +-- .../controller/WorkerLifecycleController.java | 12 ++--- .../runtime/rpc/RemoteCallWorker.java | 9 ++-- .../runtime/transfer/DataReader.java | 3 +- .../runtime/transfer/channel/ChannelId.java | 9 ++-- .../ray/streaming/runtime/util/EnvUtil.java | 2 +- .../ray/streaming/runtime/util/Platform.java | 5 +- .../ray/streaming/runtime/util/RayUtils.java | 4 +- .../runtime/util/ReflectionUtils.java | 2 +- .../streaming/runtime/util/ResourceUtil.java | 31 ++++++------ .../streaming/runtime/worker/JobWorker.java | 4 +- .../streaming/runtime/util/Mockitools.java | 4 +- .../state/keystate/KeyGroupAssignment.java | 4 +- .../state/keystate/state/MapState.java | 15 +++--- .../state/keystate/state/UnaryState.java | 2 +- 67 files changed, 229 insertions(+), 350 deletions(-) diff --git a/java/api/src/main/java/io/ray/api/Ray.java b/java/api/src/main/java/io/ray/api/Ray.java index da9047a66075..02ffc59c85e8 100644 --- a/java/api/src/main/java/io/ray/api/Ray.java +++ b/java/api/src/main/java/io/ray/api/Ray.java @@ -51,7 +51,7 @@ public static synchronized void shutdown() { /** * Check if {@link #init} has been called yet. 
* - * @return True if {@link #init} has already been called and false otherwise. + *

Returns True if {@link #init} has already been called and false otherwise. */ public static boolean isInitialized() { return runtime != null; @@ -60,8 +60,8 @@ public static boolean isInitialized() { /** * Store an object in the object store. * - * @param obj The Java object to be stored. - * @return A ObjectRef instance that represents the in-store object. + * @param obj The Java object to be stored. Returns A ObjectRef instance that represents the + * in-store object. */ public static ObjectRef put(T obj) { return internal().put(obj); @@ -70,8 +70,7 @@ public static ObjectRef put(T obj) { /** * Get an object by `ObjectRef` from the object store. * - * @param objectRef The reference of the object to get. - * @return The Java object. + * @param objectRef The reference of the object to get. Returns The Java object. */ public static T get(ObjectRef objectRef) { return internal().get(objectRef); @@ -80,8 +79,7 @@ public static T get(ObjectRef objectRef) { /** * Get a list of objects by `ObjectRef`s from the object store. * - * @param objectList A list of object references. - * @return A list of Java objects. + * @param objectList A list of object references. Returns A list of Java objects. */ public static List get(List> objectList) { return internal().get(objectList); @@ -93,8 +91,8 @@ public static List get(List> objectList) { * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, + * one containing locally available objects, one containing the rest. 
*/ public static WaitResult wait(List> waitList, int numReturns, int timeoutMs) { return internal().wait(waitList, numReturns, timeoutMs); @@ -105,8 +103,8 @@ public static WaitResult wait(List> waitList, int numReturns * objects are locally available. * * @param waitList A list of object references to wait for. - * @param numReturns The number of objects that should be returned. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param numReturns The number of objects that should be returned. Returns Two lists, one + * containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns) { return internal().wait(waitList, numReturns, Integer.MAX_VALUE); @@ -116,8 +114,8 @@ public static WaitResult wait(List> waitList, int numReturns * A convenient helper method for Ray.wait. It will wait infinitely until all objects are locally * available. * - * @param waitList A list of object references to wait for. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param waitList A list of object references to wait for. Returns Two lists, one containing + * locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList) { return internal().wait(waitList, waitList.size(), Integer.MAX_VALUE); @@ -129,9 +127,8 @@ public static WaitResult wait(List> waitList) { *

Gets a handle to a named actor with the given name. The actor must have been created with * name specified. * - * @param name The name of the named actor. - * @return an ActorHandle to the actor if the actor of specified name exists or an - * Optional.empty() + * @param name The name of the named actor. Returns an ActorHandle to the actor if the actor of + * specified name exists or an Optional.empty() */ public static Optional getActor(String name) { return internal().getActor(name, false); @@ -143,9 +140,8 @@ public static Optional getActor(String name) { *

Gets a handle to a global named actor with the given name. The actor must have been created * with global name specified. * - * @param name The global name of the named actor. - * @return an ActorHandle to the actor if the actor of specified name exists or an - * Optional.empty() + * @param name The global name of the named actor. Returns an ActorHandle to the actor if the + * actor of specified name exists or an Optional.empty() */ public static Optional getGlobalActor(String name) { return internal().getActor(name, true); @@ -155,7 +151,7 @@ public static Optional getGlobalActor(String name * If users want to use Ray API in their own threads, call this method to get the async context * and then call {@link #setAsyncContext} at the beginning of the new thread. * - * @return The async context. + *

Returns The async context. */ public static Object getAsyncContext() { return internal().getAsyncContext(); @@ -179,8 +175,7 @@ public static void setAsyncContext(Object asyncContext) { * If users want to use Ray API in their own threads, they should wrap their {@link Runnable} * objects with this method. * - * @param runnable The runnable to wrap. - * @return The wrapped runnable. + * @param runnable The runnable to wrap. Returns The wrapped runnable. */ public static Runnable wrapRunnable(Runnable runnable) { return internal().wrapRunnable(runnable); @@ -190,8 +185,7 @@ public static Runnable wrapRunnable(Runnable runnable) { * If users want to use Ray API in their own threads, they should wrap their {@link Callable} * objects with this method. * - * @param callable The callable to wrap. - * @return The wrapped callable. + * @param callable The callable to wrap. Returns The wrapped callable. */ public static Callable wrapCallable(Callable callable) { return internal().wrapCallable(callable); @@ -244,8 +238,7 @@ public static RuntimeContext getRuntimeContext() { * * @param name Name of the placement group. * @param bundles Pre-allocated resource list. - * @param strategy Actor placement strategy. - * @return A handle to the created placement group. + * @param strategy Actor placement strategy. Returns A handle to the created placement group. */ public static PlacementGroup createPlacementGroup( String name, List> bundles, PlacementStrategy strategy) { @@ -272,8 +265,7 @@ public static void exitActor() { /** * Get a placement group by placement group Id. * - * @param id placement group id. - * @return The placement group. + * @param id placement group id. Returns The placement group. */ public static PlacementGroup getPlacementGroup(PlacementGroupId id) { return internal().getPlacementGroup(id); @@ -282,7 +274,7 @@ public static PlacementGroup getPlacementGroup(PlacementGroupId id) { /** * Get all placement groups in this cluster. 
* - * @return All placement groups. + *

Returns All placement groups. */ public static List getAllPlacementGroups() { return internal().getAllPlacementGroups(); diff --git a/java/api/src/main/java/io/ray/api/call/ActorCreator.java b/java/api/src/main/java/io/ray/api/call/ActorCreator.java index b64a4fbcd0e5..c6bb9cce8ea7 100644 --- a/java/api/src/main/java/io/ray/api/call/ActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/ActorCreator.java @@ -23,8 +23,7 @@ public ActorCreator(RayFuncR func, Object[] args) { * *

Note, if this is set, this actor won't share Java worker with other actors or tasks. * - * @param jvmOptions JVM options for the Java worker that this actor is running in. - * @return self + * @param jvmOptions JVM options for the Java worker that this actor is running in. Returns self * @see io.ray.api.options.ActorCreationOptions.Builder#setJvmOptions(java.lang.String) */ public ActorCreator setJvmOptions(String jvmOptions) { @@ -35,7 +34,7 @@ public ActorCreator setJvmOptions(String jvmOptions) { /** * Create a java actor remotely and return a handle to the created actor. * - * @return a handle to the created java actor. + *

Returns a handle to the created java actor. */ public ActorHandle remote() { return Ray.internal().createActor(func, args, buildOptions()); diff --git a/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java b/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java index 4579acbb876d..4b9d25a21478 100644 --- a/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java @@ -25,7 +25,7 @@ public ActorTaskCaller(ActorHandle actor, RayFuncR func, Object[] args) { * Execute an java actor method remotely and return an object reference to the result object in * the object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java b/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java index 7e761b4c2859..5f488124b16c 100644 --- a/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java @@ -18,8 +18,7 @@ public class BaseActorCreator { * name via {@link Ray#getActor(java.lang.String)}. If you want create a named actor that is * accessible from all jobs, use {@link BaseActorCreator#setGlobalName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self * @see io.ray.api.options.ActorCreationOptions.Builder#setName(String) */ public T setName(String name) { @@ -32,8 +31,7 @@ public T setName(String name) { * Ray#getGlobalActor(java.lang.String)}. If you want to create a named actor that is only * accessible from this job, use {@link BaseActorCreator#setName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self * @see io.ray.api.options.ActorCreationOptions.Builder#setGlobalName(String) */ public T setGlobalName(String name) { @@ -47,8 +45,7 @@ public T setGlobalName(String name) { * used. * * @param resourceName resource name - * @param resourceQuantity resource quantity - * @return self + * @param resourceQuantity resource quantity Returns self * @see ActorCreationOptions.Builder#setResource(java.lang.String, java.lang.Double) */ public T setResource(String resourceName, Double resourceQuantity) { @@ -61,8 +58,7 @@ public T setResource(String resourceName, Double resourceQuantity) { * called multiple times. If the same resource is set multiple times, the latest quantity will be * used. 
* - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. Returns self * @see BaseActorCreator#setResources(java.util.Map) */ public T setResources(Map resources) { @@ -75,8 +71,7 @@ public T setResources(Map resources) { * unexpectedly. The minimum valid value is 0 (default), which indicates that the actor doesn't * need to be restarted. A value of -1 indicates that an actor should be restarted indefinitely. * - * @param maxRestarts max number of actor restarts - * @return self + * @param maxRestarts max number of actor restarts Returns self * @see ActorCreationOptions.Builder#setMaxRestarts(int) */ public T setMaxRestarts(int maxRestarts) { @@ -90,8 +85,7 @@ public T setMaxRestarts(int maxRestarts) { *

The max concurrency defaults to 1 for threaded execution. Note that the execution order is * not guaranteed when {@code max_concurrency > 1}. * - * @param maxConcurrency The max number of concurrent calls to allow for this actor. - * @return self + * @param maxConcurrency The max number of concurrent calls to allow for this actor. Returns self * @see ActorCreationOptions.Builder#setMaxConcurrency(int) */ public T setMaxConcurrency(int maxConcurrency) { @@ -103,8 +97,7 @@ public T setMaxConcurrency(int maxConcurrency) { * Set the placement group to place this actor in. * * @param group The placement group of the actor. - * @param bundleIndex The index of the bundle to place this actor in. - * @return self + * @param bundleIndex The index of the bundle to place this actor in. Returns self * @see ActorCreationOptions.Builder#setPlacementGroup(PlacementGroup, int) */ public T setPlacementGroup(PlacementGroup group, int bundleIndex) { diff --git a/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java b/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java index 88c58e05350f..8b683c7bdf55 100644 --- a/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java @@ -14,8 +14,7 @@ public class BaseTaskCaller> { /** * Set a name for this task. * - * @param name task name - * @return self + * @param name task name Returns self * @see CallOptions.Builder#setName(java.lang.String) */ public T setName(String name) { @@ -28,8 +27,7 @@ public T setName(String name) { * times. If the same resource is set multiple times, the latest quantity will be used. 
* * @param name resource name - * @param value resource capacity - * @return self + * @param value resource capacity Returns self * @see CallOptions.Builder#setResource(java.lang.String, java.lang.Double) */ public T setResource(String name, Double value) { @@ -41,8 +39,7 @@ public T setResource(String name, Double value) { * Set custom requirements for multiple resources. This method can be called multiple times. If * the same resource is set multiple times, the latest quantity will be used. * - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. Returns self * @see CallOptions.Builder#setResources(java.util.Map) */ public T setResources(Map resources) { diff --git a/java/api/src/main/java/io/ray/api/call/PyActorCreator.java b/java/api/src/main/java/io/ray/api/call/PyActorCreator.java index fb87a1eac7da..5add65346c73 100644 --- a/java/api/src/main/java/io/ray/api/call/PyActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/PyActorCreator.java @@ -17,7 +17,7 @@ public PyActorCreator(PyActorClass pyActorClass, Object[] args) { /** * Create a python actor remotely and return a handle to the created actor. * - * @return a handle to the created python actor. + *

Returns a handle to the created python actor. */ public PyActorHandle remote() { return Ray.internal().createActor(pyActorClass, args, buildOptions()); diff --git a/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java b/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java index 7ee7d8a13c92..c9444548f407 100644 --- a/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java @@ -25,7 +25,7 @@ public PyActorTaskCaller(PyActorHandle actor, PyActorMethod method, Object[] * Execute a python actor method remotely and return an object reference to the result object in * the object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java b/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java index ecd7aa3c8987..8d58e9b300a8 100644 --- a/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java @@ -22,7 +22,7 @@ public PyTaskCaller(PyFunction func, Object[] args) { * Execute a python function remotely and return an object reference to the result object in the * object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/TaskCaller.java b/java/api/src/main/java/io/ray/api/call/TaskCaller.java index 80dacec2dfdc..82f72d63e6cd 100644 --- a/java/api/src/main/java/io/ray/api/call/TaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/TaskCaller.java @@ -22,7 +22,7 @@ public TaskCaller(RayFuncR func, Object[] args) { * Execute a java function remotely and return an object reference to the result object in the * object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/function/PyActorClass.java b/java/api/src/main/java/io/ray/api/function/PyActorClass.java index d76385919b9b..c753e1f27b72 100644 --- a/java/api/src/main/java/io/ray/api/function/PyActorClass.java +++ b/java/api/src/main/java/io/ray/api/function/PyActorClass.java @@ -38,8 +38,7 @@ private PyActorClass(String moduleName, String className) { * Create a python actor class. * * @param moduleName The full module name of this actor class - * @param className The name of this actor class - * @return a python actor class + * @param className The name of this actor class Returns a python actor class */ public static PyActorClass of(String moduleName, String className) { return new PyActorClass(moduleName, className); diff --git a/java/api/src/main/java/io/ray/api/function/PyActorMethod.java b/java/api/src/main/java/io/ray/api/function/PyActorMethod.java index 6f24b5d11a3c..f91b0c9f9c10 100644 --- a/java/api/src/main/java/io/ray/api/function/PyActorMethod.java +++ b/java/api/src/main/java/io/ray/api/function/PyActorMethod.java @@ -43,8 +43,7 @@ private PyActorMethod(String methodName, Class returnType) { /** * Create a python actor method. * - * @param methodName The name of this actor method - * @return a python actor method. + * @param methodName The name of this actor method Returns a python actor method. */ public static PyActorMethod of(String methodName) { return of(methodName, Object.class); @@ -55,8 +54,7 @@ public static PyActorMethod of(String methodName) { * * @param methodName The name of this actor method * @param returnType Class of the return value of this actor method - * @param The type of the return value of this actor method - * @return a python actor method. + * @param The type of the return value of this actor method Returns a python actor method. 
*/ public static PyActorMethod of(String methodName, Class returnType) { return new PyActorMethod<>(methodName, returnType); diff --git a/java/api/src/main/java/io/ray/api/function/PyFunction.java b/java/api/src/main/java/io/ray/api/function/PyFunction.java index 2119b0bbf310..119bba4e5be2 100644 --- a/java/api/src/main/java/io/ray/api/function/PyFunction.java +++ b/java/api/src/main/java/io/ray/api/function/PyFunction.java @@ -49,8 +49,7 @@ private PyFunction(String moduleName, String functionName, Class returnType) * Create a python function. * * @param moduleName The full module name of this function - * @param functionName The name of this function - * @return a python function. + * @param functionName The name of this function Returns a python function. */ public static PyFunction of(String moduleName, String functionName) { return of(moduleName, functionName, Object.class); @@ -62,8 +61,7 @@ public static PyFunction of(String moduleName, String functionName) { * @param moduleName The full module name of this function * @param functionName The name of this function * @param returnType Class of the return value of this function - * @param Type of the return value of this function - * @return a python function. + * @param Type of the return value of this function Returns a python function. */ public static PyFunction of(String moduleName, String functionName, Class returnType) { return new PyFunction<>(moduleName, functionName, returnType); diff --git a/java/api/src/main/java/io/ray/api/id/BaseId.java b/java/api/src/main/java/io/ray/api/id/BaseId.java index ee91a77d63c4..573f549b2fa3 100644 --- a/java/api/src/main/java/io/ray/api/id/BaseId.java +++ b/java/api/src/main/java/io/ray/api/id/BaseId.java @@ -52,7 +52,7 @@ public boolean isNil() { /** * Derived class should implement this function. * - * @return The length of this id in bytes. + *

Returns The length of this id in bytes. */ public abstract int size(); diff --git a/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java b/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java index 303239735586..29a13c115052 100644 --- a/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java +++ b/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java @@ -50,8 +50,7 @@ public static class Builder { * this name via {@link Ray#getActor(java.lang.String)}. If you want create a named actor that * is accessible from all jobs, use {@link Builder#setGlobalName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self */ public Builder setName(String name) { this.name = name; @@ -64,8 +63,7 @@ public Builder setName(String name) { * {@link Ray#getGlobalActor(java.lang.String)}. If you want to create a named actor that is * only accessible from this job, use {@link Builder#setName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self */ public Builder setGlobalName(String name) { this.name = name; @@ -79,8 +77,7 @@ public Builder setGlobalName(String name) { * will be used. * * @param resourceName resource name - * @param resourceQuantity resource quantity - * @return self + * @param resourceQuantity resource quantity Returns self */ public Builder setResource(String resourceName, Double resourceQuantity) { this.resources.put(resourceName, resourceQuantity); @@ -92,8 +89,7 @@ public Builder setResource(String resourceName, Double resourceQuantity) { * be called multiple times. If the same resource is set multiple times, the latest quantity * will be used. * - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. 
Returns self */ public Builder setResources(Map resources) { this.resources.putAll(resources); @@ -105,8 +101,7 @@ public Builder setResources(Map resources) { * unexpectedly. The minimum valid value is 0 (default), which indicates that the actor doesn't * need to be restarted. A value of -1 indicates that an actor should be restarted indefinitely. * - * @param maxRestarts max number of actor restarts - * @return self + * @param maxRestarts max number of actor restarts Returns self */ public Builder setMaxRestarts(int maxRestarts) { this.maxRestarts = maxRestarts; @@ -118,8 +113,7 @@ public Builder setMaxRestarts(int maxRestarts) { * *

Note, if this is set, this actor won't share Java worker with other actors or tasks. * - * @param jvmOptions JVM options for the Java worker that this actor is running in. - * @return self + * @param jvmOptions JVM options for the Java worker that this actor is running in. Returns self */ public Builder setJvmOptions(String jvmOptions) { this.jvmOptions = jvmOptions; @@ -132,8 +126,8 @@ public Builder setJvmOptions(String jvmOptions) { *

The max concurrency defaults to 1 for threaded execution. Note that the execution order is * not guaranteed when {@code max_concurrency > 1}. * - * @param maxConcurrency The max number of concurrent calls to allow for this actor. - * @return self + * @param maxConcurrency The max number of concurrent calls to allow for this actor. Returns + * self */ public Builder setMaxConcurrency(int maxConcurrency) { if (maxConcurrency <= 0) { @@ -148,8 +142,7 @@ public Builder setMaxConcurrency(int maxConcurrency) { * Set the placement group to place this actor in. * * @param group The placement group of the actor. - * @param bundleIndex The index of the bundle to place this actor in. - * @return self + * @param bundleIndex The index of the bundle to place this actor in. Returns self */ public Builder setPlacementGroup(PlacementGroup group, int bundleIndex) { this.group = group; diff --git a/java/api/src/main/java/io/ray/api/options/CallOptions.java b/java/api/src/main/java/io/ray/api/options/CallOptions.java index 37e474d55a33..233c30aa3fe2 100644 --- a/java/api/src/main/java/io/ray/api/options/CallOptions.java +++ b/java/api/src/main/java/io/ray/api/options/CallOptions.java @@ -22,8 +22,7 @@ public static class Builder { /** * Set a name for this task. * - * @param name task name - * @return self + * @param name task name Returns self */ public Builder setName(String name) { this.name = name; @@ -35,8 +34,7 @@ public Builder setName(String name) { * multiple times. If the same resource is set multiple times, the latest quantity will be used. * * @param name resource name - * @param value resource capacity - * @return self + * @param value resource capacity Returns self */ public Builder setResource(String name, Double value) { this.resources.put(name, value); @@ -47,8 +45,7 @@ public Builder setResource(String name, Double value) { * Set custom requirements for multiple resources. This method can be called multiple times. 
If * the same resource is set multiple times, the latest quantity will be used. * - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. Returns self */ public Builder setResources(Map resources) { this.resources.putAll(resources); diff --git a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java index 53da3d48dae8..2f3eeb2a7160 100644 --- a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java +++ b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java @@ -31,24 +31,22 @@ public interface RayRuntime { /** * Store an object in the object store. * - * @param obj The Java object to be stored. - * @return A ObjectRef instance that represents the in-store object. + * @param obj The Java object to be stored. Returns A ObjectRef instance that represents the + * in-store object. */ ObjectRef put(T obj); /** * Get an object from the object store. * - * @param objectRef The reference of the object to get. - * @return The Java object. + * @param objectRef The reference of the object to get. Returns The Java object. */ T get(ObjectRef objectRef); /** * Get a list of objects from the object store. * - * @param objectRefs The list of object references. - * @return A list of Java objects. + * @param objectRefs The list of object references. Returns A list of Java objects. */ List get(List> objectRefs); @@ -58,8 +56,8 @@ public interface RayRuntime { * * @param waitList A list of ObjectRef to wait for. * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, + * one containing locally available objects, one containing the rest. 
*/ WaitResult wait(List> waitList, int numReturns, int timeoutMs); @@ -89,8 +87,7 @@ public interface RayRuntime { * name specified. * * @param name The name of the named actor. - * @param global Whether the named actor is global. - * @return ActorHandle to the actor. + * @param global Whether the named actor is global. Returns ActorHandle to the actor. */ Optional getActor(String name, boolean global); @@ -107,8 +104,7 @@ public interface RayRuntime { * * @param func The remote function to run. * @param args The arguments of the remote function. - * @param options The options for this call. - * @return The result object. + * @param options The options for this call. Returns The result object. */ ObjectRef call(RayFunc func, Object[] args, CallOptions options); @@ -117,8 +113,7 @@ public interface RayRuntime { * * @param pyFunction The Python function. * @param args Arguments of the function. - * @param options The options for this call. - * @return The result object. + * @param options The options for this call. Returns The result object. */ ObjectRef call(PyFunction pyFunction, Object[] args, CallOptions options); @@ -127,8 +122,7 @@ public interface RayRuntime { * * @param actor A handle to the actor. * @param func The remote function to run, it must be a method of the given actor. - * @param args The arguments of the remote function. - * @return The result object. + * @param args The arguments of the remote function. Returns The result object. */ ObjectRef callActor(ActorHandle actor, RayFunc func, Object[] args); @@ -137,8 +131,7 @@ public interface RayRuntime { * * @param pyActor A handle to the actor. * @param pyActorMethod The actor method. - * @param args Arguments of the function. - * @return The result object. + * @param args Arguments of the function. Returns The result object. 
*/ ObjectRef callActor(PyActorHandle pyActor, PyActorMethod pyActorMethod, Object[] args); @@ -148,8 +141,7 @@ public interface RayRuntime { * @param actorFactoryFunc A remote function whose return value is the actor object. * @param args The arguments for the remote function. * @param The type of the actor object. - * @param options The options for creating actor. - * @return A handle to the actor. + * @param options The options for creating actor. Returns A handle to the actor. */ ActorHandle createActor( RayFunc actorFactoryFunc, Object[] args, ActorCreationOptions options); @@ -159,8 +151,7 @@ ActorHandle createActor( * * @param pyActorClass The Python actor class. * @param args Arguments of the actor constructor. - * @param options The options for creating actor. - * @return A handle to the actor. + * @param options The options for creating actor. Returns A handle to the actor. */ PyActorHandle createActor(PyActorClass pyActorClass, Object[] args, ActorCreationOptions options); @@ -179,16 +170,14 @@ PlacementGroup createPlacementGroup( /** * Wrap a {@link Runnable} with necessary context capture. * - * @param runnable The runnable to wrap. - * @return The wrapped runnable. + * @param runnable The runnable to wrap. Returns The wrapped runnable. */ Runnable wrapRunnable(Runnable runnable); /** * Wrap a {@link Callable} with necessary context capture. * - * @param callable The callable to wrap. - * @return The wrapped callable. + * @param callable The callable to wrap. Returns The wrapped callable. */ Callable wrapCallable(Callable callable); @@ -198,15 +187,14 @@ PlacementGroup createPlacementGroup( /** * Get a placement group by id. * - * @param id placement group id. - * @return The placement group. + * @param id placement group id. Returns The placement group. */ PlacementGroup getPlacementGroup(PlacementGroupId id); /** * Get all placement groups in this cluster. * - * @return All placement groups. + *

Returns All placement groups. */ List getAllPlacementGroups(); @@ -221,8 +209,8 @@ PlacementGroup createPlacementGroup( * Wait for the placement group to be ready within the specified time. * * @param id Id of placement group. - * @param timeoutMs Timeout in milliseconds. - * @return True if the placement group is created. False otherwise. + * @param timeoutMs Timeout in milliseconds. Returns True if the placement group is created. False + * otherwise. */ boolean waitPlacementGroupReady(PlacementGroupId id, int timeoutMs); } diff --git a/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java b/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java index d00ea4f1195b..b5fa486aa586 100644 --- a/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java +++ b/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java @@ -21,7 +21,7 @@ public interface RuntimeContext { boolean wasCurrentActorRestarted(); /** - * Returns true if Ray is running in single-process mode, false if Ray is running in cluster mode. + * Return true if Ray is running in single-process mode, false if Ray is running in cluster mode. */ boolean isSingleProcess(); diff --git a/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java b/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java index 85a46ad8b963..1dd4b84f5c2b 100644 --- a/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java +++ b/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java @@ -71,7 +71,7 @@ public void readExternal(ObjectInput in) throws IOException, ClassNotFoundExcept /** * Serialize this actor handle to bytes. * - * @return the bytes of the actor handle + *

Returns the bytes of the actor handle */ public byte[] toBytes() { return nativeSerialize(actorId); @@ -80,7 +80,7 @@ public byte[] toBytes() { /** * Deserialize an actor handle from bytes. * - * @return the bytes of an actor handle + *

Returns the bytes of an actor handle */ public static NativeActorHandle fromBytes(byte[] bytes) { byte[] actorId = nativeDeserialize(bytes); diff --git a/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java b/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java index c9ef7ce3bbe6..d26a13dca193 100644 --- a/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java +++ b/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java @@ -69,8 +69,7 @@ public FunctionManager(List codeSearchPath) { * Get the RayFunction from a RayFunc instance (a lambda). * * @param jobId current job id. - * @param func The lambda. - * @return A RayFunction object. + * @param func The lambda. Returns A RayFunction object. */ public RayFunction getFunction(JobId jobId, RayFunc func) { JavaFunctionDescriptor functionDescriptor = RAY_FUNC_CACHE.get().get(func.getClass()); @@ -91,8 +90,7 @@ public RayFunction getFunction(JobId jobId, RayFunc func) { * Get the RayFunction from a function descriptor. * * @param jobId Current job id. - * @param functionDescriptor The function descriptor. - * @return A RayFunction object. + * @param functionDescriptor The function descriptor. Returns A RayFunction object. */ public RayFunction getFunction(JobId jobId, JavaFunctionDescriptor functionDescriptor) { JobFunctionTable jobFunctionTable = jobFunctionTables.get(jobId); diff --git a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java index cc70bbd7e963..df34212e7eec 100644 --- a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java +++ b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java @@ -35,8 +35,7 @@ public GcsClient(String redisAddress, String redisPassword) { /** * Get placement group by {@link PlacementGroupId}. * - * @param placementGroupId Id of placement group. - * @return The placement group. 
+ * @param placementGroupId Id of placement group. Returns The placement group. */ public PlacementGroup getPlacementGroupInfo(PlacementGroupId placementGroupId) { byte[] result = globalStateAccessor.getPlacementGroupInfo(placementGroupId); @@ -46,7 +45,7 @@ public PlacementGroup getPlacementGroupInfo(PlacementGroupId placementGroupId) { /** * Get all placement groups in this cluster. * - * @return All placement groups. + *

Returns All placement groups. */ public List getAllPlacementGroupInfo() { List results = globalStateAccessor.getAllPlacementGroupInfo(); diff --git a/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java b/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java index 811402994e4e..77004a8493a4 100644 --- a/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java +++ b/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java @@ -88,7 +88,7 @@ public byte[] get(byte[] key, byte[] field) { /** * Return the specified elements of the list stored at the specified key. * - * @return Multi bulk reply, specifically a list of elements in the specified range. + *

Returns Multi bulk reply, specifically a list of elements in the specified range. */ public List lrange(byte[] key, long start, long end) { try (Jedis jedis = jedisPool.getResource()) { diff --git a/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java b/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java index 80c39cf96f50..961cbfe9a9b8 100644 --- a/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java +++ b/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java @@ -54,7 +54,7 @@ public void record() { /** * Get the value to record and then reset. * - * @return latest updating value. + *

Returns latest updating value. */ protected abstract double getAndReset(); diff --git a/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java b/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java index f3af834f6715..85939ed79abb 100644 --- a/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java +++ b/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java @@ -111,7 +111,7 @@ public B tags(Map tags) { /** * Creates a metric by sub-class. * - * @return a metric + *

Returns a metric */ protected abstract M create(); diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java index 51ae9bfd2b98..76576b969e20 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java @@ -55,8 +55,7 @@ public class ObjectSerializer { * Deserialize an object from an {@link NativeRayObject} instance. * * @param nativeRayObject The object to deserialize. - * @param objectId The associated object ID of the object. - * @return The deserialized object. + * @param objectId The associated object ID of the object. Returns The deserialized object. */ public static Object deserialize( NativeRayObject nativeRayObject, ObjectId objectId, Class objectType) { @@ -111,8 +110,7 @@ public static Object deserialize( /** * Serialize an Java object to an {@link NativeRayObject} instance. * - * @param object The object to serialize. - * @return The serialized object. + * @param object The object to serialize. Returns The serialized object. */ public static NativeRayObject serialize(Object object) { if (object instanceof NativeRayObject) { diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java index 8711811b24ad..df524af11c8a 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java @@ -26,8 +26,7 @@ public ObjectStore(WorkerContext workerContext) { /** * Put a raw object into object store. * - * @param obj The ray object. - * @return Generated ID of the object. + * @param obj The ray object. Returns Generated ID of the object. 
*/ public abstract ObjectId putRaw(NativeRayObject obj); @@ -42,8 +41,7 @@ public ObjectStore(WorkerContext workerContext) { /** * Serialize and put an object to the object store. * - * @param object The object to put. - * @return Id of the object. + * @param object The object to put. Returns Id of the object. */ public ObjectId put(Object object) { if (object instanceof NativeRayObject) { @@ -73,8 +71,8 @@ public void put(Object object, ObjectId objectId) { * Get a list of raw objects from the object store. * * @param objectIds IDs of the objects to get. - * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. - * @return Result list of objects data. + * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. Returns Result list + * of objects data. */ public abstract List getRaw(List objectIds, long timeoutMs); @@ -82,8 +80,7 @@ public void put(Object object, ObjectId objectId) { * Get a list of objects from the object store. * * @param ids List of the object ids. - * @param Type of these objects. - * @return A list of GetResult objects. + * @param Type of these objects. Returns A list of GetResult objects. */ @SuppressWarnings("unchecked") public List get(List ids, Class elementType) { @@ -121,8 +118,8 @@ public List get(List ids, Class elementType) { * * @param objectIds IDs of the objects to wait for. * @param numObjects Number of objects that should appear. - * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. - * @return A bitset that indicates each object has appeared or not. + * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. Returns A bitset + * that indicates each object has appeared or not. */ public abstract List wait(List objectIds, int numObjects, long timeoutMs); @@ -132,8 +129,8 @@ public List get(List ids, Class elementType) { * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. 
- * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, + * one containing locally available objects, one containing the rest. */ public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { Preconditions.checkNotNull(waitList); @@ -188,8 +185,7 @@ public WaitResult wait(List> waitList, int numReturns, int t /** * Promote the given object to the underlying object store, and get the ownership info. * - * @param objectId The ID of the object to promote - * @return the serialized ownership address + * @param objectId The ID of the object to promote Returns the serialized ownership address */ public abstract byte[] promoteAndGetOwnershipInfo(ObjectId objectId); diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java index 1d0d540848bf..b08f7c9f5c0f 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java @@ -53,8 +53,8 @@ public PlacementGroupState getState() { /** * Wait for the placement group to be ready within the specified time. * - * @param timeoutSeconds Timeout in seconds. - * @return True if the placement group is created. False otherwise. + * @param timeoutSeconds Timeout in seconds. Returns True if the placement group is created. False + * otherwise. */ public boolean wait(int timeoutSeconds) { return Ray.internal().waitPlacementGroupReady(id, timeoutSeconds); @@ -71,8 +71,7 @@ public static class Builder { /** * Set the Id of the placement group. * - * @param id Id of the placement group. - * @return self. + * @param id Id of the placement group. Returns self. 
*/ public Builder setId(PlacementGroupId id) { this.id = id; @@ -82,8 +81,7 @@ public Builder setId(PlacementGroupId id) { /** * Set the name of the placement group. * - * @param name Name of the placement group. - * @return self. + * @param name Name of the placement group. Returns self. */ public Builder setName(String name) { this.name = name; @@ -93,8 +91,7 @@ public Builder setName(String name) { /** * Set the bundles of the placement group. * - * @param bundles the bundles of the placement group. - * @return self. + * @param bundles the bundles of the placement group. Returns self. */ public Builder setBundles(List> bundles) { this.bundles = bundles; @@ -104,8 +101,7 @@ public Builder setBundles(List> bundles) { /** * Set the placement strategy of the placement group. * - * @param strategy the placement strategy of the placement group. - * @return self. + * @param strategy the placement strategy of the placement group. Returns self. */ public Builder setStrategy(PlacementStrategy strategy) { this.strategy = strategy; @@ -115,8 +111,7 @@ public Builder setStrategy(PlacementStrategy strategy) { /** * Set the placement state of the placement group. * - * @param state the state of the placement group. - * @return self. + * @param state the state of the placement group. Returns self. */ public Builder setState(PlacementGroupState state) { this.state = state; diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java index 8e9d03cc6407..75305ef1f4e2 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java @@ -61,8 +61,8 @@ private static PlacementGroupState covertToUserSpecifiedState( /** * Generate a PlacementGroupImpl from placementGroupTableData protobuf data. * - * @param placementGroupTableData protobuf data. 
- * @return placement group info {@link PlacementGroupImpl} + * @param placementGroupTableData protobuf data. Returns placement group info {@link + * PlacementGroupImpl} */ private static PlacementGroupImpl generatePlacementGroupFromPbData( PlacementGroupTableData placementGroupTableData) { @@ -90,8 +90,8 @@ private static PlacementGroupImpl generatePlacementGroupFromPbData( /** * Generate a PlacementGroupImpl from byte array. * - * @param placementGroupByteArray bytes array from native method. - * @return placement group info {@link PlacementGroupImpl} + * @param placementGroupByteArray bytes array from native method. Returns placement group info + * {@link PlacementGroupImpl} */ public static PlacementGroupImpl generatePlacementGroupFromByteArray( byte[] placementGroupByteArray) { diff --git a/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java b/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java index e8a8351716d5..ca195d6ced11 100644 --- a/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java +++ b/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java @@ -21,8 +21,7 @@ public interface TaskSubmitter { * @param functionDescriptor The remote function to execute. * @param args Arguments of this task. * @param numReturns Return object count. - * @param options Options for this task. - * @return Ids of the return objects. + * @param options Options for this task. Returns Ids of the return objects. */ List submitTask( FunctionDescriptor functionDescriptor, @@ -35,8 +34,7 @@ List submitTask( * * @param functionDescriptor The remote function that generates the actor object. * @param args Arguments of this task. - * @param options Options for this actor creation task. - * @return Handle to the actor. + * @param options Options for this actor creation task. Returns Handle to the actor. 
* @throws IllegalArgumentException if actor of specified name exists */ BaseActorHandle createActor( @@ -50,8 +48,7 @@ BaseActorHandle createActor( * @param functionDescriptor The remote function to execute. * @param args Arguments of this task. * @param numReturns Return object count. - * @param options Options for this task. - * @return Ids of the return objects. + * @param options Options for this task. Returns Ids of the return objects. */ List submitActorTask( BaseActorHandle actor, @@ -65,8 +62,7 @@ List submitActorTask( * * @param name Name of the placement group. * @param bundles Pre-allocated resource list. - * @param strategy Actor placement strategy. - * @return A handle to the created placement group. + * @param strategy Actor placement strategy. Returns A handle to the created placement group. */ PlacementGroup createPlacementGroup( String name, List> bundles, PlacementStrategy strategy); @@ -82,8 +78,8 @@ PlacementGroup createPlacementGroup( * Wait for the placement group to be ready within the specified time. * * @param id Id of placement group. - * @param timeoutMs Timeout in milliseconds. - * @return True if the placement group is created. False otherwise. + * @param timeoutMs Timeout in milliseconds. Returns True if the placement group is created. False + * otherwise. */ boolean waitPlacementGroupReady(PlacementGroupId id, int timeoutMs); diff --git a/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java index f3282ed08c56..85c327a446b7 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java @@ -21,8 +21,7 @@ public class BinaryFileUtil { * will be protected by a file lock. 
* * @param destDir a directory to extract resource file to - * @param fileName resource file name - * @return extracted resource file + * @param fileName resource file name Returns extracted resource file */ public static File getNativeFile(String destDir, String fileName) { final File dir = new File(destDir); diff --git a/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java index 239568afa51b..4f7bf2580af2 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java @@ -13,7 +13,7 @@ public class IdUtil { /** * Compute the actor ID of the task which created this object. * - * @return The actor ID of the task which created this object. + *

Returns The actor ID of the task which created this object. */ public static ActorId getActorIdFromObjectId(ObjectId objectId) { byte[] taskIdBytes = new byte[TaskId.LENGTH]; diff --git a/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java index e9676d07b2f6..0c7a93d27818 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java @@ -11,8 +11,8 @@ public class ResourceUtil { * Convert resources map to a string that is used for the command line argument of starting * raylet. * - * @param resources The resources map to be converted. - * @return The starting-raylet command line argument, like "CPU,4,GPU,0". + * @param resources The resources map to be converted. Returns The starting-raylet command line + * argument, like "CPU,4,GPU,0". */ public static String getResourcesStringFromMap(Map resources) { StringBuilder builder = new StringBuilder(); @@ -32,9 +32,8 @@ public static String getResourcesStringFromMap(Map resources) { /** * Parse the static resources configure field and convert to the resources map. * - * @param resources The static resources string to be parsed. - * @return The map whose key represents the resource name and the value represents the resource - * quantity. + * @param resources The static resources string to be parsed. Returns The map whose key represents + * the resource name and the value represents the resource quantity. * @throws IllegalArgumentException If the resources string's format does match, it will throw an * IllegalArgumentException. 
*/ diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java index c12bdf87c48c..fbfc4736e031 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java @@ -11,7 +11,7 @@ public interface Function extends Serializable { * storage, and load it back when in fail-over through. {@link * Function#loadCheckpoint(Serializable)}. * - * @return A serializable object which represents function state. + *

Returns A serializable object which represents function state. */ default Serializable saveCheckpoint() { return null; diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java index d60e335a9d1e..877a93ae0e74 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java @@ -14,8 +14,8 @@ public interface FilterFunction extends Function { /** * The filter function that evaluates the predicate. * - * @param value The value to be filtered. - * @return True for values that should be retained, false for values to be filtered out. + * @param value The value to be filtered. Returns True for values that should be retained, false + * for values to be filtered out. */ boolean filter(T value) throws Exception; } diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java index 80e9d92729bf..527f469c301a 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java @@ -15,8 +15,8 @@ public interface Partition extends Function { * record. * * @param record The record. - * @param numPartition num of partitions - * @return IDs of the downstream partitions that should receive the record. + * @param numPartition num of partitions Returns IDs of the downstream partitions that should + * receive the record. 
*/ int[] partition(T record, int numPartition); } diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java index 999057d5a8b7..698eab29d2e3 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java @@ -59,8 +59,7 @@ public DataStream(PythonDataStream referencedStream) { * Apply a map function to this stream. * * @param mapFunction The map function. - * @param Type of data returned by the map function. - * @return A new DataStream. + * @param Type of data returned by the map function. Returns A new DataStream. */ public DataStream map(MapFunction mapFunction) { return new DataStream<>(this, new MapOperator<>(mapFunction)); @@ -70,8 +69,7 @@ public DataStream map(MapFunction mapFunction) { * Apply a flat-map function to this stream. * * @param flatMapFunction The FlatMapFunction - * @param Type of data returned by the flatmap function. - * @return A new DataStream + * @param Type of data returned by the flatmap function. Returns A new DataStream */ public DataStream flatMap(FlatMapFunction flatMapFunction) { return new DataStream<>(this, new FlatMapOperator<>(flatMapFunction)); @@ -86,8 +84,7 @@ public DataStream filter(FilterFunction filterFunction) { * type with each other. * * @param stream The DataStream to union output with. - * @param others The other DataStreams to union output with. - * @return A new UnionStream. + * @param others The other DataStreams to union output with. Returns A new UnionStream. */ @SafeVarargs public final DataStream union(DataStream stream, DataStream... others) { @@ -101,8 +98,7 @@ public final DataStream union(DataStream stream, DataStream... others) * Apply union transformations to this stream by merging {@link DataStream} outputs of the same * type with each other. 
* - * @param streams The DataStreams to union output with. - * @return A new UnionStream. + * @param streams The DataStreams to union output with. Returns A new UnionStream. */ public final DataStream union(List> streams) { if (this instanceof UnionStream) { @@ -119,8 +115,7 @@ public final DataStream union(List> streams) { * * @param other Another stream. * @param The type of the other stream data. - * @param The type of the data in the joined stream. - * @return A new JoinStream. + * @param The type of the data in the joined stream. Returns A new JoinStream. */ public JoinStream join(DataStream other) { return new JoinStream<>(this, other); @@ -134,8 +129,7 @@ public DataStream process() { /** * Apply a sink function and get a StreamSink. * - * @param sinkFunction The sink function. - * @return A new StreamSink. + * @param sinkFunction The sink function. Returns A new StreamSink. */ public DataStreamSink sink(SinkFunction sinkFunction) { return new DataStreamSink<>(this, new SinkOperator<>(sinkFunction)); @@ -145,8 +139,7 @@ public DataStreamSink sink(SinkFunction sinkFunction) { * Apply a key-by function to this stream. * * @param keyFunction the key function. - * @param The type of the key. - * @return A new KeyDataStream. + * @param The type of the key. Returns A new KeyDataStream. */ public KeyDataStream keyBy(KeyFunction keyFunction) { checkPartitionCall(); @@ -156,7 +149,7 @@ public KeyDataStream keyBy(KeyFunction keyFunction) { /** * Apply broadcast to this stream. * - * @return This stream. + *

Returns This stream. */ public DataStream broadcast() { checkPartitionCall(); @@ -166,8 +159,7 @@ public DataStream broadcast() { /** * Apply a partition to this stream. * - * @param partition The partitioning strategy. - * @return This stream. + * @param partition The partitioning strategy. Returns This stream. */ public DataStream partitionBy(Partition partition) { checkPartitionCall(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java index 53dd2a09738a..13de0b33bb4e 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java @@ -27,8 +27,7 @@ public static DataStreamSource fromSource( * * @param context Stream context. * @param values A collection of values. - * @param The type of source data. - * @return A DataStreamSource. + * @param The type of source data. Returns A DataStreamSource. */ public static DataStreamSource fromCollection( StreamingContext context, Collection values) { diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java index c50b232697e4..fb6431ef2da8 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java @@ -33,8 +33,7 @@ public KeyDataStream(PythonDataStream referencedStream) { /** * Apply a reduce function to this stream. * - * @param reduceFunction The reduce function. - * @return A new DataStream. + * @param reduceFunction The reduce function. Returns A new DataStream. 
*/ public DataStream reduce(ReduceFunction reduceFunction) { return new DataStream<>(this, new ReduceOperator(reduceFunction)); @@ -45,8 +44,7 @@ public DataStream reduce(ReduceFunction reduceFunction) { * * @param aggregateFunction The aggregate function * @param The type of aggregated intermediate data. - * @param The type of result data. - * @return A new DataStream. + * @param The type of result data. Returns A new DataStream. */ public DataStream aggregate(AggregateFunction aggregateFunction) { return new DataStream<>(this, null); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java index b192dbcc8a18..6e40ee441c32 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java @@ -43,7 +43,7 @@ public JobGraph( * Generate direct-graph(made up of a set of vertices and connected by edges) by current job graph * for simple log printing. * - * @return Digraph in string type. + *

Returns Digraph in string type. */ public String generateDigraph() { StringBuilder digraph = new StringBuilder(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java index 90f018ecdc89..25b5873105a6 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java @@ -51,8 +51,7 @@ public PythonDataStream map(String moduleName, String funcName) { /** * Apply a map function to this stream. * - * @param func The python MapFunction. - * @return A new PythonDataStream. + * @param func The python MapFunction. Returns A new PythonDataStream. */ public PythonDataStream map(PythonFunction func) { func.setFunctionInterface(FunctionInterface.MAP_FUNCTION); @@ -66,8 +65,7 @@ public PythonDataStream flatMap(String moduleName, String funcName) { /** * Apply a flat-map function to this stream. * - * @param func The python FlapMapFunction. - * @return A new PythonDataStream + * @param func The python FlapMapFunction. Returns A new PythonDataStream */ public PythonDataStream flatMap(PythonFunction func) { func.setFunctionInterface(FunctionInterface.FLAT_MAP_FUNCTION); @@ -81,9 +79,8 @@ public PythonDataStream filter(String moduleName, String funcName) { /** * Apply a filter function to this stream. * - * @param func The python FilterFunction. - * @return A new PythonDataStream that contains only the elements satisfying the given filter - * predicate. + * @param func The python FilterFunction. Returns A new PythonDataStream that contains only the + * elements satisfying the given filter predicate. 
*/ public PythonDataStream filter(PythonFunction func) { func.setFunctionInterface(FunctionInterface.FILTER_FUNCTION); @@ -95,8 +92,7 @@ public PythonDataStream filter(PythonFunction func) { * same type with each other. * * @param stream The DataStream to union output with. - * @param others The other DataStreams to union output with. - * @return A new UnionStream. + * @param others The other DataStreams to union output with. Returns A new UnionStream. */ public final PythonDataStream union(PythonDataStream stream, PythonDataStream... others) { List streams = new ArrayList<>(); @@ -109,8 +105,7 @@ public final PythonDataStream union(PythonDataStream stream, PythonDataStream... * Apply union transformations to this stream by merging {@link PythonDataStream} outputs of the * same type with each other. * - * @param streams The DataStreams to union output with. - * @return A new UnionStream. + * @param streams The DataStreams to union output with. Returns A new UnionStream. */ public final PythonDataStream union(List streams) { if (this instanceof PythonUnionStream) { @@ -129,8 +124,7 @@ public PythonStreamSink sink(String moduleName, String funcName) { /** * Apply a sink function and get a StreamSink. * - * @param func The python SinkFunction. - * @return A new StreamSink. + * @param func The python SinkFunction. Returns A new StreamSink. */ public PythonStreamSink sink(PythonFunction func) { func.setFunctionInterface(FunctionInterface.SINK_FUNCTION); @@ -144,8 +138,7 @@ public PythonKeyDataStream keyBy(String moduleName, String funcName) { /** * Apply a key-by function to this stream. * - * @param func the python keyFunction. - * @return A new KeyDataStream. + * @param func the python keyFunction. Returns A new KeyDataStream. */ public PythonKeyDataStream keyBy(PythonFunction func) { checkPartitionCall(); @@ -156,7 +149,7 @@ public PythonKeyDataStream keyBy(PythonFunction func) { /** * Apply broadcast to this stream. * - * @return This stream. + *

Returns This stream. */ public PythonDataStream broadcast() { checkPartitionCall(); @@ -166,8 +159,7 @@ public PythonDataStream broadcast() { /** * Apply a partition to this stream. * - * @param partition The partitioning strategy. - * @return This stream. + * @param partition The partitioning strategy. Returns This stream. */ public PythonDataStream partitionBy(PythonPartition partition) { checkPartitionCall(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java index 078f84ac4a94..8116fd392923 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java @@ -31,8 +31,7 @@ public PythonDataStream reduce(String moduleName, String funcName) { /** * Apply a reduce function to this stream. * - * @param func The reduce function. - * @return A new DataStream. + * @param func The reduce function. Returns A new DataStream. */ public PythonDataStream reduce(PythonFunction func) { func.setFunctionInterface(FunctionInterface.REDUCE_FUNCTION); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java index 2ec3b6dfb944..0c555e7c5ada 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java @@ -11,7 +11,7 @@ public interface CommonConfig extends Config { /** * Ray streaming job id. Non-custom. * - * @return Job id with string type. + *

Returns Job id with string type. */ @DefaultValue(value = "default-job-id") @Key(value = JOB_ID) @@ -20,7 +20,7 @@ public interface CommonConfig extends Config { /** * Ray streaming job name. Non-custom. * - * @return Job name with string type. + *

Returns Job name with string type. */ @DefaultValue(value = "default-job-name") @Key(value = JOB_NAME) diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java index 79189431a2ba..bc2fc2bd3662 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java @@ -11,7 +11,7 @@ public interface SchedulerConfig extends Config { /** * The timeout ms of worker initiation. Default is: 10000ms(10s). * - * @return timeout ms + *

Returns timeout ms */ @Key(WORKER_INITIATION_WAIT_TIMEOUT_MS) @DefaultValue(value = "10000") @@ -20,7 +20,7 @@ public interface SchedulerConfig extends Config { /** * The timeout ms of worker starting. Default is: 10000ms(10s). * - * @return timeout ms + *

Returns timeout ms */ @Key(WORKER_STARTING_WAIT_TIMEOUT_MS) @DefaultValue(value = "10000") diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java index 83b62696e6ba..faf8703905be 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java @@ -12,15 +12,14 @@ public interface ContextBackend { /** * check if key exists in state * - * @return true if exists + *

Returns true if exists */ boolean exists(final String key) throws Exception; /** * get content by key * - * @param key key - * @return the StateBackend + * @param key key Returns the StateBackend */ byte[] get(final String key) throws Exception; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java index 2852e0f99141..b0d3b522ed10 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java @@ -156,7 +156,7 @@ public AtomicInteger getExecutionVertexIdGenerator() { /** * Get all execution vertices from current execution graph. * - * @return all execution vertices. + *

Returns all execution vertices. */ public List getAllExecutionVertices() { return executionJobVertexMap.values().stream() @@ -168,7 +168,7 @@ public List getAllExecutionVertices() { /** * Get all execution vertices whose status is 'TO_ADD' from current execution graph. * - * @return all added execution vertices. + *

Returns all added execution vertices. */ public List getAllAddedExecutionVertices() { return executionJobVertexMap.values().stream() @@ -181,8 +181,7 @@ public List getAllAddedExecutionVertices() { /** * Get specified execution vertex from current execution graph by execution vertex id. * - * @param executionVertexId execution vertex id. - * @return the specified execution vertex. + * @param executionVertexId execution vertex id. Returns the specified execution vertex. */ public ExecutionVertex getExecutionVertexByExecutionVertexId(int executionVertexId) { if (executionVertexMap.containsKey(executionVertexId)) { @@ -194,8 +193,7 @@ public ExecutionVertex getExecutionVertexByExecutionVertexId(int executionVertex /** * Get specified execution vertex from current execution graph by actor id. * - * @param actorId the actor id of execution vertex. - * @return the specified execution vertex. + * @param actorId the actor id of execution vertex. Returns the specified execution vertex. */ public ExecutionVertex getExecutionVertexByActorId(ActorId actorId) { return actorIdExecutionVertexMap.get(actorId); @@ -204,8 +202,7 @@ public ExecutionVertex getExecutionVertexByActorId(ActorId actorId) { /** * Get specified actor by actor id. * - * @param actorId the actor id of execution vertex. - * @return the specified actor handle. + * @param actorId the actor id of execution vertex. Returns the specified actor handle. 
*/ public Optional getActorById(ActorId actorId) { return getAllActors().stream().filter(actor -> actor.getId().equals(actorId)).findFirst(); @@ -215,8 +212,7 @@ public Optional getActorById(ActorId actorId) { * Get the peer actor in the other side of channelName of a given actor * * @param actor actor in this side - * @param channelName the channel name - * @return the peer actor in the other side + * @param channelName the channel name Returns the peer actor in the other side */ public BaseActorHandle getPeerActor(BaseActorHandle actor, String channelName) { Set set = getActorsByChannelId(channelName); @@ -233,8 +229,7 @@ public BaseActorHandle getPeerActor(BaseActorHandle actor, String channelName) { /** * Get actors in both sides of a channelId * - * @param channelId the channelId - * @return actors in both sides + * @param channelId the channelId Returns actors in both sides */ public Set getActorsByChannelId(String channelId) { return channelGroupedActors.getOrDefault(channelId, Sets.newHashSet()); @@ -243,7 +238,7 @@ public Set getActorsByChannelId(String channelId) { /** * Get all actors by graph. * - * @return actor list + *

Returns actor list */ public List getAllActors() { return getActorsFromJobVertices(getExecutionJobVertexList()); @@ -252,7 +247,7 @@ public List getAllActors() { /** * Get source actors by graph. * - * @return actor list + *

Returns actor list */ public List getSourceActors() { List executionJobVertices = @@ -266,7 +261,7 @@ public List getSourceActors() { /** * Get transformation and sink actors by graph. * - * @return actor list + *

Returns actor list */ public List getNonSourceActors() { List executionJobVertices = @@ -283,7 +278,7 @@ public List getNonSourceActors() { /** * Get sink actors by graph. * - * @return actor list + *

Returns actor list */ public List getSinkActors() { List executionJobVertices = @@ -297,8 +292,7 @@ public List getSinkActors() { /** * Get actors according to job vertices. * - * @param executionJobVertices specified job vertices - * @return actor list + * @param executionJobVertices specified job vertices Returns actor list */ public List getActorsFromJobVertices( List executionJobVertices) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java index cf869c0c4f2a..0aa426672db6 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java @@ -109,7 +109,7 @@ public String getExecutionJobVertexName() { /** * e.g. 1-SourceOperator * - * @return operator name with index + *

Returns operator name with index */ public String getExecutionJobVertexNameWithIndex() { return executionJobVertexId + "-" + executionJobVertexName; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java index 9b07d131f7c9..b0dec4aef0c0 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java @@ -24,7 +24,7 @@ public Resources() {} /** * Get registered containers, the container list is read-only. * - * @return container list. + *

Returns container list. */ public ImmutableList getRegisteredContainers() { return ImmutableList.copyOf(registerContainers); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java index fd672978a4f2..a1dd5b6bc14b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java @@ -101,7 +101,7 @@ private void loadMasterCheckpoint() { /** * Init JobMaster. To initiate or recover other components(like metrics and extra coordinators). * - * @return init result + *

Returns init result */ public Boolean init(boolean isRecover) { LOG.info("Initializing job master, isRecover={}.", isRecover); @@ -136,8 +136,7 @@ public Boolean init(boolean isRecover) { * * * @param jobMasterActor JobMaster actor - * @param jobGraph logical plan - * @return submit result + * @param jobGraph logical plan Returns submit result */ public boolean submitJob(ActorHandle jobMasterActor, JobGraph jobGraph) { LOG.info("Begin submitting job using logical plan: {}.", jobGraph); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java index b563917d97b4..ce8dd474157a 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java @@ -19,22 +19,21 @@ public interface GraphManager { /** * Build execution graph from job graph. * - * @param jobGraph logical plan of streaming job. - * @return physical plan of streaming job. + * @param jobGraph logical plan of streaming job. Returns physical plan of streaming job. */ ExecutionGraph buildExecutionGraph(JobGraph jobGraph); /** * Get job graph. * - * @return the job graph. + *

Returns the job graph. */ JobGraph getJobGraph(); /** * Get execution graph. * - * @return the execution graph. + *

Returns the execution graph. */ ExecutionGraph getExecutionGraph(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java index fbe3f696aa59..43671eea1b28 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java @@ -10,7 +10,7 @@ public interface ResourceManager extends ResourceAssignStrategy { /** * Get registered containers, the container list is read-only. * - * @return the registered container list + *

Returns the registered container list */ ImmutableList getRegisteredContainers(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java index 9ce131d2599c..8df20790cb90 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java @@ -13,8 +13,7 @@ public interface ResourceAssignStrategy { * Assign {@link Container} for {@link ExecutionVertex} * * @param containers registered container - * @param executionGraph execution graph - * @return allocating view + * @param executionGraph execution graph Returns allocating view */ ResourceAssignmentView assignResource(List containers, ExecutionGraph executionGraph); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java index 48f2366cd37d..74b646c67364 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java @@ -42,8 +42,8 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy { * Assign resource to each execution vertex in the given execution graph. 
* * @param containers registered containers - * @param executionGraph execution graph - * @return allocating map, key is container ID, value is list of vertextId, and contains vertices + * @param executionGraph execution graph Returns allocating map, key is container ID, value is + * list of vertextId, and contains vertices */ @Override public ResourceAssignmentView assignResource( @@ -133,8 +133,7 @@ private void updateContainerCapacity(List containers, int capacity) { * Find a container which matches required resource * * @param requiredResource required resource - * @param containers registered containers - * @return container that matches the required resource + * @param containers registered containers Returns container that matches the required resource */ private Container findMatchedContainer( Map requiredResource, List containers) { @@ -160,8 +159,7 @@ private Container findMatchedContainer( * Check if current container has enough resource * * @param requiredResource required resource - * @param container container - * @return true if matches, false else + * @param container container Returns true if matches, false else */ private boolean hasEnoughResource(Map requiredResource, Container container) { LOG.info("Check resource for index: {}, container: {}", currentContainerIndex, container); @@ -202,8 +200,7 @@ private boolean hasEnoughResource(Map requiredResource, Containe /** * Forward to next container * - * @param containers registered container list - * @return next container in the list + * @param containers registered container list Returns next container in the list */ private Container forwardToNextContainer(List containers) { this.currentContainerIndex = (this.currentContainerIndex + 1) % containers.size(); @@ -213,8 +210,7 @@ private Container forwardToNextContainer(List containers) { /** * Get current container * - * @param containers registered container - * @return current container to allocate actor + * @param containers registered container 
Returns current container to allocate actor */ private Container getCurrentContainer(List containers) { return containers.get(currentContainerIndex); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java index d0fb60d54878..962c0bdfa92b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java @@ -8,8 +8,7 @@ public interface JobScheduler { /** * Schedule streaming job using the physical plan. * - * @param executionGraph physical plan - * @return scheduling result + * @param executionGraph physical plan Returns scheduling result */ boolean scheduleJob(ExecutionGraph executionGraph); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java index 039715ccbefd..6309bb334e32 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java @@ -95,8 +95,7 @@ private void initAndStart(ExecutionGraph executionGraph) { /** * Create JobWorker actors according to the physical plan. * - * @param executionGraph physical plan - * @return actor creation result + * @param executionGraph physical plan Returns actor creation result */ public boolean createWorkers(ExecutionGraph executionGraph) { LOG.info("Begin creating workers."); @@ -149,8 +148,7 @@ public boolean startWorkers(ExecutionGraph executionGraph, long checkpointId) { /** * Build workers context. 
* - * @param executionGraph execution graph - * @return vertex to worker context map + * @param executionGraph execution graph Returns vertex to worker context map */ protected Map buildWorkersContext( ExecutionGraph executionGraph) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java index 3cd3984b2043..f5c4be5f7ee1 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java @@ -36,8 +36,7 @@ public boolean createWorkers(List executionVertices) { /** * Create JobWorker actor according to the execution vertex. * - * @param executionVertex target execution vertex - * @return creation result + * @param executionVertex target execution vertex Returns creation result */ private boolean createWorker(ExecutionVertex executionVertex) { LOG.info( @@ -85,8 +84,7 @@ private boolean createWorker(ExecutionVertex executionVertex) { * Using context to init JobWorker. * * @param vertexToContextMap target JobWorker actor - * @param timeout timeout for waiting, unit: ms - * @return initiation result + * @param timeout timeout for waiting, unit: ms Returns initiation result */ public boolean initWorkers( Map vertexToContextMap, int timeout) { @@ -122,8 +120,7 @@ public boolean initWorkers( * Start JobWorkers to run task. 
* * @param executionGraph physical plan - * @param timeout timeout for waiting, unit: ms - * @return starting result + * @param timeout timeout for waiting, unit: ms Returns starting result */ public boolean startWorkers(ExecutionGraph executionGraph, long lastCheckpointId, int timeout) { LOG.info("Begin starting workers."); @@ -153,8 +150,7 @@ public boolean startWorkers(ExecutionGraph executionGraph, long lastCheckpointId /** * Stop and destroy JobWorkers' actor. * - * @param executionVertices target vertices - * @return destroy result + * @param executionVertices target vertices Returns destroy result */ public boolean destroyWorkers(List executionVertices) { return asyncBatchExecute(this::destroyWorker, executionVertices); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java index 6cd788138883..5a5475350d65 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java @@ -25,8 +25,7 @@ public class RemoteCallWorker { * Call JobWorker actor to init. * * @param actor target JobWorker actor - * @param context JobWorker's context - * @return init result + * @param context JobWorker's context Returns init result */ public static ObjectRef initWorker(BaseActorHandle actor, JobWorkerContext context) { LOG.info("Call worker to initiate, actor: {}, context: {}.", actor.getId(), context); @@ -51,8 +50,7 @@ public static ObjectRef initWorker(BaseActorHandle actor, JobWorkerCont * Call JobWorker actor to start. 
* * @param actor target JobWorker actor - * @param checkpointId checkpoint ID to be rollback - * @return start result + * @param checkpointId checkpoint ID to be rollback Returns start result */ public static ObjectRef rollback(BaseActorHandle actor, final Long checkpointId) { LOG.info("Call worker to start, actor: {}.", actor.getId()); @@ -81,8 +79,7 @@ public static ObjectRef rollback(BaseActorHandle actor, final Long checkpointId) /** * Call JobWorker actor to destroy without reconstruction. * - * @param actor target JobWorker actor - * @return destroy result + * @param actor target JobWorker actor Returns destroy result */ public static Boolean shutdownWithoutReconstruction(BaseActorHandle actor) { LOG.info("Call worker to shutdown without reconstruction, actor is {}.", actor.getId()); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java index ff3c62fee11c..17ab4fe1ec4a 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java @@ -115,8 +115,7 @@ private static native long createDataReaderNative( /** * Read message from input channels, if timeout, return null. 
* - * @param timeoutMillis timeout - * @return message or null + * @param timeoutMillis timeout Returns message or null */ public ChannelMessage read(long timeoutMillis) { if (buf.isEmpty()) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java index 731031d62a9b..d3a4b8d71773 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java @@ -86,8 +86,7 @@ public static String genRandomIdStr() { * Generate channel name, which will be {@link ChannelId#ID_LENGTH} character * * @param fromTaskId upstream task id - * @param toTaskId downstream task id - * @return channel name + * @param toTaskId downstream task id Returns channel name */ public static String genIdStr(int fromTaskId, int toTaskId, long ts) { /* @@ -117,8 +116,7 @@ public static String genIdStr(int fromTaskId, int toTaskId, long ts) { } /** - * @param id hex string representation of channel id - * @return bytes representation of channel id + * @param id hex string representation of channel id Returns bytes representation of channel id */ public static byte[] idStrToBytes(String id) { byte[] idBytes = BaseEncoding.base16().decode(id.toUpperCase()); @@ -127,8 +125,7 @@ public static byte[] idStrToBytes(String id) { } /** - * @param id bytes representation of channel id - * @return hex string representation of channel id + * @param id bytes representation of channel id Returns hex string representation of channel id */ public static String idBytesToStr(byte[] id) { assert id.length == ChannelId.ID_LENGTH; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java 
b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java index 29ac29f4d51e..07fda18a6c5a 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java @@ -36,7 +36,7 @@ public static void loadNativeLibraries() { /** * Execute an external command. * - * @return Whether the command succeeded. + *

Returns Whether the command succeeded. */ public static boolean executeCommand(List command, int waitTimeoutSeconds) { try { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java index 324e1ab9dcd9..effafcc540a0 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java @@ -77,10 +77,7 @@ public static void wrapDirectBuffer(ByteBuffer buffer, long address, int size) { buffer.clear(); } - /** - * @param buffer a DirectBuffer backed by off-heap memory - * @return address of off-heap memory - */ + /** @param buffer a DirectBuffer backed by off-heap memory Returns address of off-heap memory */ public static long getAddress(ByteBuffer buffer) { return ((DirectBuffer) buffer).address(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java index b3243d69f449..a97a2f5bab3b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java @@ -15,7 +15,7 @@ public class RayUtils { /** * Get all node info from GCS * - * @return node info list + *

Returns node info list */ public static List getAllNodeInfo() { if (Ray.getRuntimeContext().isSingleProcess()) { @@ -28,7 +28,7 @@ public static List getAllNodeInfo() { /** * Get all alive node info map * - * @return node info map, key is unique node id , value is node info + *

Returns node info map, key is unique node id , value is node info */ public static Map getAliveNodeInfoMap() { return getAllNodeInfo().stream() diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java index 13a75f8ebc7b..bc04a1ded0f6 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java @@ -20,7 +20,7 @@ public static Method findMethod(Class cls, String methodName) { /** * For covariant return type, return the most specific method. * - * @return all methods named by {@code methodName}, + *

Returns all methods named by {@code methodName}, */ public static List findMethods(Class cls, String methodName) { List> classes = new ArrayList<>(); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java index b00b6ee96b85..b8336cd145be 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java @@ -52,8 +52,8 @@ public static void logProcessMemoryDetail() { } /** - * @return jvm heap usage ratio. note that one of the survivor space is not include in total - * memory while calculating this ratio. + * Returns jvm heap usage ratio. note that one of the survivor space is not include in total + * memory while calculating this ratio. */ public static double getJvmHeapUsageRatio() { Runtime runtime = Runtime.getRuntime(); @@ -61,8 +61,8 @@ public static double getJvmHeapUsageRatio() { } /** - * @return jvm heap usage(in bytes). note that this value doesn't include one of the survivor - * space. + * Returns jvm heap usage(in bytes). note that this value doesn't include one of the survivor + * space. */ public static long getJvmHeapUsageInBytes() { Runtime runtime = Runtime.getRuntime(); @@ -95,8 +95,8 @@ public static double getProcessCpuUsage() { } /** - * @return the system cpu usage. This value is a double in the [0.0,1.0] We will try to use `vsar` - * to get cpu usage by default, and use MXBean if any exception raised. + * Returns the system cpu usage. This value is a double in the [0.0,1.0] We will try to use `vsar` + * to get cpu usage by default, and use MXBean if any exception raised. 
*/ public static double getSystemCpuUsage() { double cpuUsage = 0.0; @@ -109,10 +109,10 @@ public static double getSystemCpuUsage() { } /** - * @return the "recent cpu usage" for the whole system. This value is a double in the [0.0,1.0] - * interval. A value of 0.0 means that all CPUs were idle during the recent period of time - * observed, while a value of 1.0 means that all CPUs were actively running 100% of the time - * during the recent period being observed + * Returns the "recent cpu usage" for the whole system. This value is a double in the [0.0,1.0] + * interval. A value of 0.0 means that all CPUs were idle during the recent period of time + * observed, while a value of 1.0 means that all CPUs were actively running 100% of the time + * during the recent period being observed */ public static double getSystemCpuUtilByMXBean() { return osmxb.getSystemCpuLoad(); @@ -144,7 +144,7 @@ public static double getSystemCpuUtilByVsar() throws Exception { return cpuUsageFromVsar; } - /** Returns the system load average for the last minute */ + /** Returnss the system load average for the last minute */ public static double getSystemLoadAverage() { return osmxb.getSystemLoadAverage(); } @@ -158,8 +158,7 @@ public static int getCpuCores() { * Get containers by hostname of address * * @param containers container list - * @param containerHosts container hostname or address set - * @return matched containers + * @param containerHosts container hostname or address set Returns matched containers */ public static List getContainersByHostname( List containers, Collection containerHosts) { @@ -175,8 +174,7 @@ public static List getContainersByHostname( /** * Get container by hostname * - * @param hostName container hostname - * @return container + * @param hostName container hostname Returns container */ public static Optional getContainerByHostname( List containers, String hostName) { @@ -190,8 +188,7 @@ public static Optional getContainerByHostname( /** * Get container by id * 
- * @param containerID container id - * @return container + * @param containerID container id Returns container */ public static Optional getContainerById( List containers, ContainerId containerID) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java index 15200c65633e..5a6554802bc3 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java @@ -137,8 +137,8 @@ public Boolean init(JobWorkerContext workerContext) { /** * Start worker's stream tasks with specific checkpoint ID. * - * @return a {@link CallResult} with {@link ChannelRecoverInfo}, contains {@link - * ChannelCreationStatus} of each input queue. + *

Returns a {@link CallResult} with {@link ChannelRecoverInfo}, contains {@link + * ChannelCreationStatus} of each input queue. */ public CallResult rollback(Long checkpointId, Long startRollbackTs) { synchronized (initialStateChangeLock) { diff --git a/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java b/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java index eb48f1691a12..5fe774e20b22 100644 --- a/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java +++ b/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java @@ -49,8 +49,8 @@ public static List mockGetAllNodeInfo() { /** * Mock get node info map * - * @param nodeInfos all node infos fetched from GCS - * @return node info map, key is node unique id, value is node info + * @param nodeInfos all node infos fetched from GCS Returns node info map, key is node unique id, + * value is node info */ public static Map mockGetNodeInfoMap(List nodeInfos) { return nodeInfos.stream() diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java index 921ea8598b43..10f99c0b6b2f 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java @@ -50,8 +50,8 @@ public static KeyGroup getKeyGroup(int maxParallelism, int parallelism, int inde * Assigning the key to a key-group index. * * @param key the key to assign. - * @param maxParallelism the maximum parallelism. - * @return the key-group index to which the given key is assigned. + * @param maxParallelism the maximum parallelism. Returns the key-group index to which the given + * key is assigned. 
*/ public static int assignKeyGroupIndexForKey(Object key, int maxParallelism) { return Math.abs(key.hashCode() % maxParallelism); diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java index a632d21d0728..933081af5383 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java @@ -28,8 +28,7 @@ public interface MapState extends UnaryState> { /** * Returns the current value associated with the given key. * - * @param key The key of the mapping - * @return The value of the mapping with the given key + * @param key The key of the mapping Returns The value of the mapping with the given key */ V get(K key); @@ -65,8 +64,8 @@ public interface MapState extends UnaryState> { /** * Returns whether there exists the given mapping. * - * @param key The key of the mapping - * @return True if there exists a mapping whose key equals to the given key + * @param key The key of the mapping Returns True if there exists a mapping whose key equals to + * the given key */ default boolean contains(K key) { return get().containsKey(key); @@ -75,7 +74,7 @@ default boolean contains(K key) { /** * Returns all the mappings in the state * - * @return An iterable view of all the key-value pairs in the state. + *

Returns An iterable view of all the key-value pairs in the state. */ default Iterable> entries() { return get().entrySet(); @@ -84,7 +83,7 @@ default Iterable> entries() { /** * Returns all the keys in the state * - * @return An iterable view of all the keys in the state. + *

Returns An iterable view of all the keys in the state. */ default Iterable keys() { return get().keySet(); @@ -93,7 +92,7 @@ default Iterable keys() { /** * Returns all the values in the state. * - * @return An iterable view of all the values in the state. + *

Returns An iterable view of all the values in the state. */ default Iterable values() { return get().values(); @@ -102,7 +101,7 @@ default Iterable values() { /** * Iterates over all the mappings in the state. * - * @return An iterator over all the mappings in the state + *

Returns An iterator over all the mappings in the state */ default Iterator> iterator() { return get().entrySet().iterator(); diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java index 637b573144b8..5c250b594973 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java @@ -24,7 +24,7 @@ public interface UnaryState extends State { /** * get the value in state * - * @return the value in state + *

Returns the value in state */ O get(); } From 81ba4f7bc83f82bd47b4d3870f1a24452c254539 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 241/244] Revert "[RLlib] Support for D4RL + Semi-working CQL Benchmark (#13550)" This reverts commit 125c4d4534fcb5446c23d16cdb9a213de24f9a7a. --- rllib/agents/cql/cql.py | 2 - rllib/evaluation/worker_set.py | 5 +- rllib/offline/__init__.py | 2 - rllib/offline/d4rl_reader.py | 52 ------------------- rllib/tuned_examples/cql/halfcheetah-cql.yaml | 1 - 5 files changed, 1 insertion(+), 61 deletions(-) delete mode 100644 rllib/offline/d4rl_reader.py diff --git a/rllib/agents/cql/cql.py b/rllib/agents/cql/cql.py index 30bbe89d4553..04a63be72751 100644 --- a/rllib/agents/cql/cql.py +++ b/rllib/agents/cql/cql.py @@ -15,8 +15,6 @@ SAC_CONFIG, { # You should override this to point to an offline dataset. "input": "sampler", - # Offline RL does not need IS estimators - "input_evaluation": [], # Number of iterations with Behavior Cloning Pretraining "bc_iters": 20000, # CQL Loss Temperature diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 8361e0af8777..80cf617bb029 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -8,7 +8,7 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \ _validate_multiagent_config from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ - ShuffledInput, D4RLReader + ShuffledInput from ray.rllib.env.env_context import EnvContext from ray.rllib.policy import Policy from ray.rllib.utils import merge_dicts @@ -266,9 +266,6 @@ def session_creator(): input_creator = ( lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx), config["shuffle_buffer_size"])) - elif "d4rl" in config["input"]: - env_name = config["input"].split(".")[1] - input_creator = (lambda ioctx: D4RLReader(env_name, ioctx)) else: input_creator = ( lambda ioctx: 
ShuffledInput(JsonReader(config["input"], ioctx), diff --git a/rllib/offline/__init__.py b/rllib/offline/__init__.py index 540151cc2d4d..69b07c657006 100644 --- a/rllib/offline/__init__.py +++ b/rllib/offline/__init__.py @@ -5,7 +5,6 @@ from ray.rllib.offline.input_reader import InputReader from ray.rllib.offline.mixed_input import MixedInput from ray.rllib.offline.shuffled_input import ShuffledInput -from ray.rllib.offline.d4rl_reader import D4RLReader __all__ = [ "IOContext", @@ -16,5 +15,4 @@ "InputReader", "MixedInput", "ShuffledInput", - "D4RLReader", ] diff --git a/rllib/offline/d4rl_reader.py b/rllib/offline/d4rl_reader.py deleted file mode 100644 index 2c02af08868c..000000000000 --- a/rllib/offline/d4rl_reader.py +++ /dev/null @@ -1,52 +0,0 @@ -import logging -import gym - -from ray.rllib.offline.input_reader import InputReader -from ray.rllib.offline.io_context import IOContext -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.annotations import override, PublicAPI -from ray.rllib.utils.typing import SampleBatchType -from typing import Dict - -logger = logging.getLogger(__name__) - - -@PublicAPI -class D4RLReader(InputReader): - """Reader object that loads the dataset from the D4RL dataset.""" - - @PublicAPI - def __init__(self, inputs: str, ioctx: IOContext = None): - """Initialize a D4RLReader. - - Args: - inputs (str): String corresponding to D4RL environment name - ioctx (IOContext): Current IO context object. 
- """ - import d4rl - self.env = gym.make(inputs) - self.dataset = convert_to_batch(d4rl.qlearning_dataset(self.env)) - assert self.dataset.count >= 1 - self.dataset.shuffle() - self.counter = 0 - - @override(InputReader) - def next(self) -> SampleBatchType: - if self.counter >= self.dataset.count: - self.counter = 0 - self.dataset.shuffle() - - self.counter += 1 - return self.dataset.slice(start=self.counter, end=self.counter + 1) - - -def convert_to_batch(dataset: Dict) -> SampleBatchType: - # Converts D4RL dataset to SampleBatch - d = {} - d[SampleBatch.OBS] = dataset["observations"] - d[SampleBatch.ACTIONS] = dataset["actions"] - d[SampleBatch.NEXT_OBS] = dataset["next_observations"] - d[SampleBatch.REWARDS] = dataset["rewards"] - d[SampleBatch.DONES] = dataset["terminals"] - - return SampleBatch(d) diff --git a/rllib/tuned_examples/cql/halfcheetah-cql.yaml b/rllib/tuned_examples/cql/halfcheetah-cql.yaml index 9a5fa9982875..5bab20751c53 100644 --- a/rllib/tuned_examples/cql/halfcheetah-cql.yaml +++ b/rllib/tuned_examples/cql/halfcheetah-cql.yaml @@ -5,7 +5,6 @@ halfcheetah_cql: episode_reward_mean: 9000 config: # SAC Configs - input: d4rl.halfcheetah-medium-v0 framework: torch horizon: 1000 soft_horizon: false From a3b8ed0304538020b1f33d582f0f6a3c90da4243 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 242/244] Revert "[RLlib] Fix problem in preprocessing nested MultiDiscrete (#13308)" This reverts commit 598840a8846121efbc03ec1ced49941c5107fd63. 
--- rllib/models/preprocessors.py | 2 +- rllib/models/tests/test_preprocessors.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/rllib/models/preprocessors.py b/rllib/models/preprocessors.py index 44312a807432..2b0bcb092062 100644 --- a/rllib/models/preprocessors.py +++ b/rllib/models/preprocessors.py @@ -174,7 +174,7 @@ def transform(self, observation: TensorType) -> np.ndarray: @override(Preprocessor) def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: - array[offset:offset + self.size] = self.transform(observation) + array[offset + observation] = 1 class NoPreprocessor(Preprocessor): diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index 4ce7b73e7e74..5515b6fea6b1 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -71,17 +71,6 @@ def test_one_hot_preprocessor(self): pp.transform(np.array([0, 1, 3])), [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]) - def test_nested_multidiscrete_one_hot_preprocessor(self): - space = Tuple((MultiDiscrete([2, 3, 4]), )) - pp = get_preprocessor(space)(space) - self.assertTrue(pp.shape == (9, )) - check( - pp.transform((np.array([1, 2, 0]), )), - [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - check( - pp.transform((np.array([0, 1, 3]), )), - [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]) - if __name__ == "__main__": import pytest From fc4bc078e6d795dbeb8d4c89551e2eec718e386d Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 243/244] Revert "[RLlib] Dreamer: Fix broken import and add compilation test case. (#13553)" This reverts commit 237d253107f32dc3c080b33ff21d0f893ddb65c3. 
--- rllib/BUILD | 23 ++++-------- rllib/agents/dreamer/dreamer.py | 2 -- rllib/agents/dreamer/dreamer_model.py | 2 +- rllib/agents/dreamer/tests/test_dreamer.py | 41 ---------------------- rllib/env/wrappers/dm_control_wrapper.py | 2 +- 5 files changed, 9 insertions(+), 61 deletions(-) delete mode 100644 rllib/agents/dreamer/tests/test_dreamer.py diff --git a/rllib/BUILD b/rllib/BUILD index f8f1cbd3c6f8..daa623dff843 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -436,13 +436,13 @@ py_test( srcs = ["agents/a3c/tests/test_a3c.py"] ) -# APEXTrainer (DQN) -py_test( - name = "test_apex_dqn", - tags = ["agents_dir"], - size = "medium", - srcs = ["agents/dqn/tests/test_apex_dqn.py"] -) +## APEXTrainer (DQN) +#py_test( +# name = "test_apex_dqn", +# tags = ["agents_dir"], +# size = "large", +# srcs = ["agents/dqn/tests/test_apex_dqn.py"] +#) # APEXDDPGTrainer py_test( @@ -482,15 +482,6 @@ py_test( srcs = ["agents/dqn/tests/test_simple_q.py"] ) -# TODO: enable once we have a MuJoCo-independent test case. -## Dreamer -#py_test( -# name = "test_dreamer", -# tags = ["agents_dir"], -# size = "small", -# srcs = ["agents/dreamer/tests/test_dreamer.py"] -#) - # ES py_test( name = "test_es", diff --git a/rllib/agents/dreamer/dreamer.py b/rllib/agents/dreamer/dreamer.py index 21646d61871d..94774d9fec91 100644 --- a/rllib/agents/dreamer/dreamer.py +++ b/rllib/agents/dreamer/dreamer.py @@ -31,8 +31,6 @@ "discount": 0.99, # Lambda "lambda": 0.95, - # Clipping is done inherently via policy tanh. 
- "clip_actions": False, # Training iterations per data collection from real env "dreamer_train_iters": 100, # Horizon for Enviornment (1000 for Mujoco/DMC) diff --git a/rllib/agents/dreamer/dreamer_model.py b/rllib/agents/dreamer/dreamer_model.py index f2db417e512b..5483f664f839 100644 --- a/rllib/agents/dreamer/dreamer_model.py +++ b/rllib/agents/dreamer/dreamer_model.py @@ -1,6 +1,6 @@ import numpy as np from typing import Any, List, Tuple -from ray.rllib.models.torch.misc import Reshape +from ray.rllib.models.torch.modules.reshape import Reshape from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.framework import TensorType diff --git a/rllib/agents/dreamer/tests/test_dreamer.py b/rllib/agents/dreamer/tests/test_dreamer.py deleted file mode 100644 index 2b318866ca48..000000000000 --- a/rllib/agents/dreamer/tests/test_dreamer.py +++ /dev/null @@ -1,41 +0,0 @@ -import unittest - -import ray -from ray import tune -import ray.rllib.agents.dreamer as dreamer -from ray.rllib.examples.env.dm_control_suite import hopper_hop -from ray.rllib.utils.test_utils import check_compute_single_action, \ - framework_iterator - - -class TestDreamer(unittest.TestCase): - """Sanity tests for DreamerTrainer.""" - - def setUp(self): - ray.init() - - def tearDown(self): - ray.shutdown() - - def test_dreamer_compilation(self): - """Test whether an DreamerTrainer can be built with all frameworks.""" - config = dreamer.DEFAULT_CONFIG.copy() - tune.register_env("dm_control_hopper_hop", lambda _: hopper_hop()) - - num_iterations = 1 - - # Test against all frameworks. 
- for _ in framework_iterator(config, frameworks="torch"): - for env in ["dm_control_hopper_hop"]: - trainer = dreamer.DREAMERTrainer(config=config, env=env) - for i in range(num_iterations): - results = trainer.train() - print(results) - check_compute_single_action(trainer) - trainer.stop() - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/wrappers/dm_control_wrapper.py b/rllib/env/wrappers/dm_control_wrapper.py index 3286aae28adf..6734e2a3ab66 100644 --- a/rllib/env/wrappers/dm_control_wrapper.py +++ b/rllib/env/wrappers/dm_control_wrapper.py @@ -31,7 +31,7 @@ specs = None try: from dm_control import suite -except (ImportError, OSError): +except ImportError: suite = None import numpy as np From b549a4d53c90531d8a0be65d8c2e52b3101fba9a Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Mon, 15 Feb 2021 20:25:12 -0800 Subject: [PATCH 244/244] Revert "[Autoscaler] Display node status tag in autsocaler status (#13561)" This reverts commit 1e852ed3e6b829207b714b3fe4daa35f6d33074c. --- python/ray/autoscaler/_private/autoscaler.py | 2 +- python/ray/autoscaler/_private/util.py | 4 ++-- python/ray/tests/test_resource_demand_scheduler.py | 12 +++++------- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py index 1166597ed9d6..2838e24c18b4 100644 --- a/python/ray/autoscaler/_private/autoscaler.py +++ b/python/ray/autoscaler/_private/autoscaler.py @@ -765,7 +765,7 @@ def summary(self): ] is_pending = status in pending_states if is_pending: - pending_nodes.append((ip, node_type, status)) + pending_nodes.append((ip, node_type)) else: # TODO (Alex): Failed nodes are now immediately killed, so # this list will almost always be empty. 
We should ideally diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 1e677e35bc7d..81a2c1fc00ff 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -362,8 +362,8 @@ def format_info_string(lm_summary, autoscaler_summary, time=None): for node_type, count in autoscaler_summary.pending_launches.items(): line = f" {node_type}, {count} launching" pending_lines.append(line) - for ip, node_type, status in autoscaler_summary.pending_nodes: - line = f" {ip}: {node_type}, {status.lower()}" + for ip, node_type in autoscaler_summary.pending_nodes: + line = f" {ip}: {node_type}, setting up" pending_lines.append(line) if pending_lines: pending_report = "\n".join(pending_lines) diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 3bfe28f7cc83..4b2027af1d66 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -28,7 +28,7 @@ from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE, TAG_RAY_NODE_KIND, \ NODE_KIND_WORKER, TAG_RAY_NODE_STATUS, \ STATUS_UP_TO_DATE, STATUS_UNINITIALIZED, \ - STATUS_UPDATE_FAILED, STATUS_WAITING_FOR_SSH, \ + STATUS_UPDATE_FAILED, \ NODE_KIND_HEAD, NODE_TYPE_LEGACY_WORKER, \ NODE_TYPE_LEGACY_HEAD from ray.test_utils import same_elements @@ -1419,8 +1419,7 @@ def testSummary(self): assert summary.active_nodes["empty_node"] == 1 assert len(summary.active_nodes) == 2, summary.active_nodes - assert summary.pending_nodes == [("172.0.0.3", "p2.xlarge", - STATUS_WAITING_FOR_SSH)] + assert summary.pending_nodes == [("172.0.0.3", "p2.xlarge")] assert summary.pending_launches == {"m4.16xlarge": 2} assert summary.failed_nodes == [("172.0.0.4", "m4.4xlarge")] @@ -2404,8 +2403,7 @@ def test_info_string(): "p3.2xlarge": 2, "m4.4xlarge": 20 }, - pending_nodes=[("1.2.3.4", "m4.4xlarge", STATUS_WAITING_FOR_SSH), - ("1.2.3.5", "m4.4xlarge", 
STATUS_WAITING_FOR_SSH)], + pending_nodes=[("1.2.3.4", "m4.4xlarge"), ("1.2.3.5", "m4.4xlarge")], pending_launches={"m4.4xlarge": 2}, failed_nodes=[("1.2.3.6", "p3.2xlarge")]) @@ -2418,8 +2416,8 @@ def test_info_string(): 20 m4.4xlarge Pending: m4.4xlarge, 2 launching - 1.2.3.4: m4.4xlarge, waiting-for-ssh - 1.2.3.5: m4.4xlarge, waiting-for-ssh + 1.2.3.4: m4.4xlarge, setting up + 1.2.3.5: m4.4xlarge, setting up Recent failures: (no failures)