diff --git a/.bazelrc b/.bazelrc index 8de20992a595..2baaa0fa2af5 100644 --- a/.bazelrc +++ b/.bazelrc @@ -95,7 +95,6 @@ test:asan --test_env=ASAN_OPTIONS="detect_leaks=0" test:asan --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.2 /usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" # For example, for Ubuntu 18.04 libasan can be found here: # test:asan --test_env=LD_PRELOAD="/usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" -test:asan-buildkite --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.5" # CI configuration: aquery:ci --color=no diff --git a/.buildkite/Dockerfile b/.buildkite/Dockerfile index d20a9170f31d..2f52fb92d1d1 100644 --- a/.buildkite/Dockerfile +++ b/.buildkite/Dockerfile @@ -2,33 +2,18 @@ FROM ubuntu:focal ARG REMOTE_CACHE_URL ARG BUILDKITE_PULL_REQUEST -ARG BUILDKITE_COMMIT -ARG BUILDKITE_PULL_REQUEST_BASE_BRANCH ENV DEBIAN_FRONTEND=noninteractive ENV TZ=America/Los_Angeles - ENV BUILDKITE=true ENV CI=true ENV PYTHON=3.6 -ENV RAY_USE_RANDOM_PORTS=1 -ENV RAY_DEFAULT_BUILD=1 -ENV BUILDKITE_PULL_REQUEST=${BUILDKITE_PULL_REQUEST} -ENV BUILDKITE_COMMIT=${BUILDKITE_COMMIT} -ENV BUILDKITE_PULL_REQUEST_BASE_BRANCH=${BUILDKITE_PULL_REQUEST_BASE_BRANCH} RUN apt-get update -qq RUN apt-get install -y -qq \ curl python-is-python3 git build-essential \ - sudo unzip apt-utils dialog tzdata wget rsync \ - language-pack-en tmux cmake gdb vim htop \ - libgtk2.0-dev zlib1g-dev libgl1-mesa-dev - -# System conf for tests + sudo unzip apt-utils dialog tzdata wget RUN locale -a -ENV LC_ALL=en_US.utf8 -ENV LANG=en_US.utf8 -RUN echo "ulimit -c 0" >> /root/.bashrc # Setup Bazel caches RUN (echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc); \ @@ -42,7 +27,3 @@ WORKDIR /ray COPY . . 
RUN ./ci/travis/ci.sh init RUN bash --login -i ./ci/travis/ci.sh build - -# Run determine test to run -RUN bash --login -i -c "python ./ci/travis/determine_tests_to_run.py --output=json > affected_set.json" -RUN cat affected_set.json \ No newline at end of file diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 73e715cde885..91c673d52604 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,184 +1,6 @@ -- label: ":book: Lint" +- label: "Ray Core Tests (:buildkite: Experimental)" commands: - - export LINT=1 - - ./ci/travis/install-dependencies.sh - - ./ci/travis/ci.sh lint - - ./ci/travis/ci.sh build - -- label: ":java: Java" - conditions: ["RAY_CI_JAVA_AFFECTED"] + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... +- label: "Ray Dashboard Tests" commands: - - apt-get install -y openjdk-8-jdk maven clang-format - # Compile Java again so bazel will compile Java as a language. - - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build - - ./java/test.sh - -- label: ":java: Streaming" - conditions: - ["RAY_CI_STREAMING_PYTHON_AFFECTED", "RAY_CI_STREAMING_JAVA_AFFECTED"] - commands: - - apt-get install -y openjdk-8-jdk maven - # Compile Java again so bazel will compile Java as a language. - - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build - - bazel test --config=ci $(./scripts/bazel_export_options) - //streaming:all - - bash streaming/src/test/run_streaming_queue_test.sh - -- label: ":cpp: Worker" - commands: - - ./ci/travis/ci.sh test_cpp - -- label: ":cpp: Tests" - commands: - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - -- //:all -rllib/... 
-core_worker_test - -- label: ":cpp: Tests (ASAN)" - commands: - - bazel test --config=ci --config=asan $(./scripts/bazel_export_options) - --build_tests_only - --config=asan-buildkite - --jobs=2 - -- //:all -//:core_worker_test - -- label: ":serverless: Dashboard + Serve Tests" - conditions: - [ - "RAY_CI_SERVE_AFFECTED", - "RAY_CI_DASHBOARD_AFFECTED", - "RAY_CI_PYTHON_AFFECTED", - ] - commands: - - TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - python/ray/new_dashboard/... - - bazel test --config=ci $(./scripts/bazel_export_options) - python/ray/serve/... - -- label: ":python: (Small & Large)" - conditions: ["RAY_CI_PYTHON_AFFECTED"] - commands: - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,-medium_size_python_tests_a_to_j,-medium_size_python_tests_k_to_z - python/ray/tests/... - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,client_tests - --test_env=RAY_CLIENT_MODE=1 - python/ray/tests/... -- label: ":python: (Medium A-J)" - conditions: ["RAY_CI_PYTHON_AFFECTED"] - commands: - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_a_to_j - python/ray/tests/... -- label: ":python: (Medium K-Z)" - conditions: ["RAY_CI_PYTHON_AFFECTED"] - commands: - - bazel test --config=ci $(./scripts/bazel_export_options) - --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_k_to_z - python/ray/tests/... - -- label: ":brain: RLlib: Learning tests (from rllib/tuned_examples/*.yaml)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=learning_tests_tf - rllib/... 
-- label: ":brain: RLlib: Learning tests with tf=1.x (from rllib/tuned_examples/*.yaml)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] - commands: - - RLLIB_TESTING=1 TF_VERSION=1.14.0 TFP_VERSION=0.7 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=learning_tests_tf - rllib/... -- label: ":brain: RLlib: Learning tests with Torch (from rllib/tuned_examples/*.yaml)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=learning_tests_torch - rllib/... -- label: ":brain: RLlib: Quick Agent train.py runs" - conditions: ["RAY_CI_RLLIB_AFFECTED"] - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=quick_train - --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... - # Test everything that does not have any of the "main" labels: - # "learning_tests|quick_train|examples|tests_dir". - - bazel test --config=ci $(./scripts/bazel_export_options) - --build_tests_only - --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir - --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... -- label: ":brain: RLlib: rllib/examples/" - conditions: ["RAY_CI_RLLIB_AFFECTED"] - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... 
- - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... -- label: ":brain: RLlib: rllib/tests/ (A-L)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... -- label: ":brain: RLlib: rllib/tests/ (M-Z)" - conditions: ["RAY_CI_RLLIB_AFFECTED"] - commands: - - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only - --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - rllib/... 
- -- label: ":octopus: Tune tests and examples" - conditions: ["RAY_CI_TUNE_AFFECTED"] - commands: - - TUNE_TESTING=1 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only,-example python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=example,-tf,-pytorch,-py37,-flaky python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37,-flaky python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37,-flaky python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-py37,flaky python/ray/tune/... - -- label: ":octopus: SGD tests and examples" - conditions: ["RAY_CI_SGD_AFFECTED"] - commands: - - SGD_TESTING=1 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/... - -- label: ":octopus: Tune/SGD tests and examples. Python 3.7" - conditions: ["RAY_CI_TUNE_AFFECTED", "RAY_CI_SGD_AFFECTED"] - commands: - - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh - # Bcause Python version changed, we need to re-install Ray here - - rm -rf ./python/ray/thirdparty_files; ./ci/travis/ci.sh build - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... 
- -- label: ":book: Doc tests and examples" - conditions: - ["RAY_CI_PYTHON_AFFECTED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_DOC_AFFECTED"] - commands: - - DOC_TESTING=1 ./ci/travis/install-dependencies.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 doc/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 doc/... - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 doc/... + - bazel test --config=ci $(./scripts/bazel_export_options) python/ray/new_dashboard/... diff --git a/.travis.yml b/.travis.yml index 6ee68c003d94..36e49aaa74ef 100644 --- a/.travis.yml +++ b/.travis.yml @@ -78,9 +78,7 @@ matrix: - . ./ci/travis/ci.sh build script: # Run all C++ unit tests with ASAN enabled. ASAN adds too much overhead to run Python tests. - # NOTE: core_worker_test is out-of-date and should already covered by - # Python tests. - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -core_worker_test + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all - os: osx osx_image: xcode7 @@ -197,7 +195,6 @@ matrix: env: # - PYTHON=3.6 - LINUX_WHEELS=1 LINUX_JARS=1 - - DOCKER_BUILD_PY37=1 - PYTHONWARNINGS=ignore - RAY_INSTALL_JAVA=1 language: java @@ -210,32 +207,10 @@ matrix: - . 
./ci/travis/ci.sh test_wheels - export PATH="$HOME/miniconda3/bin:$PATH" - python -m pip install docker - - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY37; fi + - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py; fi - bash ./java/build-jar-multiplatform.sh linux cache: false - - # Build Py36 & Py38 Docker Images - - os: linux - env: - - LINUX_WHEELS=1 - - DOCKER_BUILD_PY36_38=1 - - PYTHONWARNINGS=ignore - language: java - jdk: openjdk8 - install: - - . ./ci/travis/ci.sh init RAY_CI_LINUX_WHEELS_AFFECTED - before_script: - - . ./ci/travis/ci.sh build - script: - - wget --quiet "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda3.sh - - bash miniconda3.sh -b -p "$HOME/miniconda3" - - export PATH="$HOME/miniconda3/bin:$PATH" - - conda install -y python=3.7.6 - - python -m pip install docker - - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY36_PY38; fi - cache: false - # Build and deploy multi-platform jars. - os: linux env: @@ -443,7 +418,6 @@ matrix: script: - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/... - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/lightning_accelerators/... # There are no python 3.7 tests for RaySGD at the moment # - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=py37 python/ray/util/sgd/... # - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=py37 doc/... @@ -461,10 +435,11 @@ matrix: script: - . 
./ci/travis/ci.sh test_cpp script: + # raylet integration tests (core_worker_tests included in bazel tests below) + - ./ci/suppress_output bash src/ray/test/run_object_manager_tests.sh + # cc bazel tests (w/o RLlib) - # NOTE: core_worker_test is out-of-date and should already covered by Python - # tests. - - ./ci/suppress_output bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -core_worker_test + - ./ci/suppress_output bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... # ray serve tests - if [ $RAY_CI_SERVE_AFFECTED == "1" ]; then ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only python/ray/serve/...; fi @@ -494,7 +469,7 @@ deploy: on: repo: ray-project/ray all_branches: true - condition: ($LINUX_WHEELS = 1 && $DOCKER_BUILD_PY37=1) || $MAC_WHEELS = 1 + condition: $LINUX_WHEELS = 1 || $MAC_WHEELS = 1 - provider: s3 edge: true # This supposedly opts in to deploy v2. @@ -510,16 +485,16 @@ deploy: on: branch: master repo: ray-project/ray - condition: ($LINUX_WHEELS = 1 && $DOCKER_BUILD_PY37=1) || $MAC_WHEELS = 1 + condition: $LINUX_WHEELS = 1 || $MAC_WHEELS = 1 - provider: script edge: true # This supposedly opts in to deploy v2. - script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY37 + script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py skip_cleanup: true on: repo: ray-project/ray all_branches: true - condition: $LINUX_WHEELS = 1 && $DOCKER_BUILD_PY37 = 1 + condition: $LINUX_WHEELS = 1 # Upload jars so that we can debug locally for every commit - provider: s3 @@ -553,12 +528,3 @@ deploy: repo: ray-project/ray branch: master condition: $MULTIPLATFORM_JARS = 1 || $MAC_JARS = 1 || $LINUX_JARS = 1 - - - provider: script - edge: true # This supposedly opts in to deploy v2. 
- script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY36_PY38 - skip_cleanup: true - on: - repo: ray-project/ray - all_branches: true - condition: $LINUX_WHEELS = 1 && $DOCKER_BUILD_PY36_38 = 1 diff --git a/BUILD.bazel b/BUILD.bazel index 7dbd8fadb526..a863727ecd95 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -702,16 +702,6 @@ cc_test( ], ) -cc_test( - name = "memory_store_test", - srcs = ["src/ray/core_worker/test/memory_store_test.cc"], - copts = COPTS, - deps = [ - ":core_worker_lib", - "@com_google_googletest//:gtest_main", - ], -) - cc_test( name = "direct_actor_transport_test", srcs = ["src/ray/core_worker/test/direct_actor_transport_test.cc"], @@ -1375,6 +1365,30 @@ cc_library( ], ) +cc_binary( + name = "object_manager_test", + testonly = 1, + srcs = ["src/ray/object_manager/test/object_manager_test.cc"], + copts = COPTS, + deps = [ + ":object_manager", + "//src/ray/protobuf:common_cc_proto", + "@com_google_googletest//:gtest_main", + ], +) + +cc_binary( + name = "object_manager_stress_test", + testonly = 1, + srcs = ["src/ray/object_manager/test/object_manager_stress_test.cc"], + copts = COPTS, + deps = [ + ":object_manager", + "//src/ray/protobuf:common_cc_proto", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "platform_shims", srcs = [] + select({ diff --git a/README.rst b/README.rst index c937160fd836..ee025cb38751 100644 --- a/README.rst +++ b/README.rst @@ -132,7 +132,7 @@ This example runs a parallel grid search to optimize an example objective functi "beta": tune.choice([1, 2, 3]) }) - print("Best config: ", analysis.get_best_config(metric="mean_loss", mode="min")) + print("Best config: ", analysis.get_best_config(metric="mean_loss")) # Get a dataframe for analyzing trial results. df = analysis.results_df @@ -300,6 +300,7 @@ More Information Getting Involved ---------------- +- `Community Slack`_: Join our Slack workspace. 
- `Forum`_: For discussions about development, questions about usage, and feature requests. - `GitHub Issues`_: For reporting bugs. - `Twitter`_: Follow updates on Twitter. @@ -310,4 +311,5 @@ Getting Involved .. _`GitHub Issues`: https://github.com/ray-project/ray/issues .. _`StackOverflow`: https://stackoverflow.com/questions/tagged/ray .. _`Meetup Group`: https://www.meetup.com/Bay-Area-Ray-Meetup/ +.. _`Community Slack`: https://forms.gle/9TSdDYUgxYs8SA9e8 .. _`Twitter`: https://twitter.com/raydistributed diff --git a/benchmarks/README.md b/benchmarks/README.md deleted file mode 100644 index 352845dd02b5..000000000000 --- a/benchmarks/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Ray Scalability Envelope - -### Note: This document is a WIP. This is not a scalability guarantee (yet). - -## Distributed Benchmarks - -All distributed tests are run on 64 nodes with 64 cores/node. Maximum number of nodes is achieved by adding 4 core nodes. - -| Dimension | Quantity | -| --------- | -------- | -| # nodes in cluster (with trivial task workload) | 250+ | -| # actors in cluster (with trivial workload) | 10k+ | -| # simultaneously running tasks | 10k+ | -| # simultaneously running placement groups | 1k+ | - -## Object Store Benchmarks - -| Dimension | Quantity | -| --------- | -------- | -| 1 GiB object broadcast (# of nodes) | 50+ | - - -## Single Node Benchmarks. - -All single node benchmarks are run on a single m4.16xlarge. 
- -| Dimension | Quantity | -| --------- | -------- | -| # of object arguments to a single task | 10000+ | -| # of objects returned from a single task | 3000+ | -| # of plasma objects in a single `ray.get` call | 10000+ | -| # of tasks queued on a single node | 1,000,000+ | -| Maximum `ray.get` numpy object size | 100GiB+ | - - diff --git a/benchmarks/distributed/config.yaml b/benchmarks/distributed/config.yaml deleted file mode 100644 index 630de0eef265..000000000000 --- a/benchmarks/distributed/config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -cluster_name: distributed-benchmarks -min_workers: 0 -max_workers: 999999 - -upscaling_speed: 9999999 - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a, us-west-2b, us-west-2c, us-west-2d - -auth: - ssh_user: ubuntu - -available_node_types: - head_node: - node_config: - InstanceType: m5.16xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - small: 1 - max_workers: 999999 - worker_node: - node_config: - InstanceType: m5.16xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - min_workers: 63 - max_workers: 63 - small_worker_node: - node_config: - InstanceType: m5.xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - -head_node_type: head_node - -worker_default_node_type: worker_node - -setup_commands: - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - - pip install tqdm - - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;' - -idle_timeout_minutes: 1 - -head_start_ray_commands: - - ray stop - - ulimit -n 65535; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml - -# Command to start ray on worker nodes. You don't need to change this. 
-worker_start_ray_commands: - - ray stop - - ulimit -n 65535; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/benchmarks/distributed/test_distributed.py b/benchmarks/distributed/test_distributed.py deleted file mode 100644 index c929cdba8c1a..000000000000 --- a/benchmarks/distributed/test_distributed.py +++ /dev/null @@ -1,204 +0,0 @@ -import ray -import ray.autoscaler.sdk -from ray.test_utils import Semaphore -from ray.util.placement_group import placement_group, remove_placement_group - -from time import sleep, perf_counter -from tqdm import tqdm, trange - -TEST_NUM_NODES = 64 -MAX_ACTORS_IN_CLUSTER = 10000 -MAX_RUNNING_TASKS_IN_CLUSTER = 10000 -MAX_PLACEMENT_GROUPS = 1000 -MAX_NUM_NODES = 250 - - -def num_alive_nodes(): - n = 0 - for node in ray.nodes(): - if node["Alive"]: - n += 1 - return n - - -def scale_to(target): - while num_alive_nodes() != target: - ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target) - print(f"Current # nodes: {num_alive_nodes()}, target: {target}") - print("Waiting ...") - sleep(5) - - -def test_nodes(): - scale_to(MAX_NUM_NODES) - assert num_alive_nodes() == MAX_NUM_NODES - # Treat this as a trivial task to ensure the nodes are all functioning - test_max_running_tasks() - - -def test_max_actors(): - # TODO (Alex): Dynamically set this based on number of cores - cpus_per_actor = 0.25 - - @ray.remote(num_cpus=cpus_per_actor) - class Actor: - def foo(self): - pass - - actors = [ - Actor.remote() - for _ in trange(MAX_ACTORS_IN_CLUSTER, desc="Launching actors") - ] - - for actor in tqdm(actors, desc="Ensuring actors have started"): - assert ray.get(actor.foo.remote()) is None - - -def test_max_running_tasks(): - counter = Semaphore.remote(0) - blocker = Semaphore.remote(0) - - @ray.remote(num_cpus=0.25) - def task(counter, blocker): - sleep(300) - - refs = [ - task.remote(counter, blocker) - for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER, desc="Launching tasks") - ] - - max_cpus = 
ray.cluster_resources()["CPU"] - min_cpus_available = max_cpus - for _ in trange(int(300 / 0.1), desc="Waiting"): - try: - cur_cpus = ray.available_resources().get("CPU", 0) - min_cpus_available = min(min_cpus_available, cur_cpus) - except Exception: - # There are race conditions `.get` can fail if a new heartbeat - # comes at the same time. - pass - sleep(0.1) - - # There are some relevant magic numbers in this check. 10k tasks each - # require 1/4 cpus. Therefore, ideally 2.5k cpus will be used. - err_str = f"Only {max_cpus - min_cpus_available}/{max_cpus} cpus used." - assert max_cpus - min_cpus_available > 2000, err_str - - for _ in trange( - MAX_RUNNING_TASKS_IN_CLUSTER, - desc="Ensuring all tasks have finished"): - done, refs = ray.wait(refs) - assert ray.get(done[0]) is None - - -def test_many_placement_groups(): - @ray.remote(num_cpus=1, resources={"node": 0.02}) - def f1(): - sleep(10) - pass - - @ray.remote(num_cpus=1) - def f2(): - sleep(10) - pass - - @ray.remote(resources={"node": 0.02}) - def f3(): - sleep(10) - pass - - bundle1 = {"node": 0.02, "CPU": 1} - bundle2 = {"CPU": 1} - bundle3 = {"node": 0.02} - - pgs = [] - for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"): - pg = placement_group(bundles=[bundle1, bundle2, bundle3]) - pgs.append(pg) - - for pg in tqdm(pgs, desc="Waiting for pgs to be ready"): - ray.get(pg.ready()) - - refs = [] - for pg in tqdm(pgs, desc="Scheduling tasks"): - ref1 = f1.options(placement_group=pg).remote() - ref2 = f2.options(placement_group=pg).remote() - ref3 = f3.options(placement_group=pg).remote() - refs.extend([ref1, ref2, ref3]) - - for _ in trange(10, desc="Waiting"): - sleep(1) - - with tqdm() as p_bar: - while refs: - done, refs = ray.wait(refs) - p_bar.update() - - for pg in tqdm(pgs, desc="Cleaning up pgs"): - remove_placement_group(pg) - - -ray.init(address="auto") - -scale_to(TEST_NUM_NODES) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) - 
-cluster_resources = ray.cluster_resources() - -available_resources = ray.available_resources() -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done launching nodes") - -actor_start = perf_counter() -test_max_actors() -actor_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done testing actors") - -task_start = perf_counter() -test_max_running_tasks() -task_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done testing tasks") - -pg_start = perf_counter() -test_many_placement_groups() -pg_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done testing placement groups") - -launch_start = perf_counter() -test_nodes() -launch_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == MAX_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -print("Done.") - -actor_time = actor_end - actor_start -task_time = task_end - task_start -pg_time = pg_end - pg_start -launch_time = launch_end - launch_start - -print(f"Actor time: {actor_time} ({MAX_ACTORS_IN_CLUSTER} actors)") -print(f"Task time: {task_time} ({MAX_RUNNING_TASKS_IN_CLUSTER} tasks)") -print(f"PG time: {pg_time} ({MAX_PLACEMENT_GROUPS} placement groups)") -print(f"Node launch time: {launch_time} ({MAX_NUM_NODES} nodes)") diff --git a/benchmarks/object_store/config.yaml b/benchmarks/object_store/config.yaml deleted 
file mode 100644 index 5ea3ce8352af..000000000000 --- a/benchmarks/object_store/config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -cluster_name: object-store-benchmarks -min_workers: 0 -max_workers: 999999 - -upscaling_speed: 9999999 - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - -auth: - ssh_user: ubuntu - -available_node_types: - head_node: - node_config: - InstanceType: m4.4xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - worker_node: - node_config: - InstanceType: m4.xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - -head_node_type: head_node - -worker_default_node_type: worker_node - -setup_commands: - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - - pip install tqdm numpy - -idle_timeout_minutes: 5 - -head_start_ray_commands: - - ray stop - - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml - -# Command to start ray on worker nodes. You don't need to change this. 
-worker_start_ray_commands: - - ray stop - - ulimit -n 1000000; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/benchmarks/object_store/test_object_store.py b/benchmarks/object_store/test_object_store.py deleted file mode 100644 index 83312fddd90e..000000000000 --- a/benchmarks/object_store/test_object_store.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np - -import ray -import ray.autoscaler.sdk - -from time import sleep, perf_counter -from tqdm import tqdm - -NUM_NODES = 50 -OBJECT_SIZE = 2**30 - - -def num_alive_nodes(): - n = 0 - for node in ray.nodes(): - if node["Alive"]: - n += 1 - return n - - -def scale_to(target): - while num_alive_nodes() != target: - ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target) - print(f"Current # nodes: {num_alive_nodes()}, target: {target}") - print("Waiting ...") - sleep(5) - - -def test_object_broadcast(): - scale_to(NUM_NODES) - - @ray.remote(num_cpus=1, resources={"node": 1}) - class Actor: - def foo(self): - pass - - def sum(self, arr): - return np.sum(arr) - - actors = [Actor.remote() for _ in range(NUM_NODES)] - - arr = np.ones(OBJECT_SIZE, dtype=np.uint8) - ref = ray.put(arr) - - for actor in tqdm(actors, desc="Ensure all actors have started."): - ray.get(actor.foo.remote()) - - result_refs = [] - for actor in tqdm(actors, desc="Broadcasting objects"): - result_refs.append(actor.sum.remote(ref)) - - results = ray.get(result_refs) - for result in results: - assert result == OBJECT_SIZE - - -ray.init(address="auto") -start = perf_counter() -test_object_broadcast() -end = perf_counter() -print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)") diff --git a/benchmarks/single_node/config.yaml b/benchmarks/single_node/config.yaml deleted file mode 100644 index e5798541f9c1..000000000000 --- a/benchmarks/single_node/config.yaml +++ /dev/null @@ -1,41 +0,0 @@ -cluster_name: single-node-benchmarks -min_workers: 0 -max_workers: 0 - -upscaling_speed: 9999999 
- -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - -auth: - ssh_user: ubuntu - -available_node_types: - head_node: - node_config: - InstanceType: m4.16xlarge - ImageId: ami-098555c9b343eb09c - resources: - node: 1 - max_workers: 999999 - worker_node: - node_config: - InstanceType: m4.xlarge - ImageId: ami-098555c9b343eb09c - -head_node_type: head_node - -worker_default_node_type: worker_node - -setup_commands: - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - - pip install numpy tqdm - - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1000000" >> /etc/security/limits.conf; echo "* hard nofile 1000000" >> /etc/security/limits.conf;' - -idle_timeout_minutes: 5 - -head_start_ray_commands: - - ray stop - - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --object-store-memory=128000000000 --autoscaling-config=~/ray_bootstrap_config.yaml diff --git a/benchmarks/single_node/test_single_node.py b/benchmarks/single_node/test_single_node.py deleted file mode 100644 index 75d783124523..000000000000 --- a/benchmarks/single_node/test_single_node.py +++ /dev/null @@ -1,175 +0,0 @@ -import numpy as np -import ray -import ray.autoscaler.sdk -from ray.test_utils import Semaphore - -from time import perf_counter -from tqdm import trange, tqdm - -MAX_ARGS = 10000 -MAX_RETURNS = 3000 -MAX_RAY_GET_ARGS = 10000 -MAX_QUEUED_TASKS = 1_000_000 -MAX_RAY_GET_SIZE = 100 * 2**30 - - -def test_many_args(): - @ray.remote - def sum_args(*args): - return sum(sum(arg) for arg in args) - - args = [[1 for _ in range(10000)] for _ in range(MAX_ARGS)] - result = ray.get(sum_args.remote(*args)) - assert result == MAX_ARGS * 10000 - - -def test_many_returns(): - @ray.remote(num_returns=MAX_RETURNS) - def f(): - to_return = [] - for _ in range(MAX_RETURNS): - obj = list(range(10000)) - to_return.append(obj) - - return tuple(to_return) - - returned_refs = 
f.remote() - assert len(returned_refs) == MAX_RETURNS - - for ref in returned_refs: - expected = list(range(10000)) - obj = ray.get(ref) - assert obj == expected - - -def test_ray_get_args(): - def with_dese(): - print("Putting test objects:") - refs = [] - for _ in trange(MAX_RAY_GET_ARGS): - obj = list(range(10000)) - refs.append(ray.put(obj)) - - print("Getting objects") - results = ray.get(refs) - assert len(results) == MAX_RAY_GET_ARGS - - print("Asserting correctness") - for obj in tqdm(results): - expected = list(range(10000)) - assert obj == expected - - def with_zero_copy(): - print("Putting test objects:") - refs = [] - for _ in trange(MAX_RAY_GET_ARGS): - obj = np.arange(10000) - refs.append(ray.put(obj)) - - print("Getting objects") - results = ray.get(refs) - assert len(results) == MAX_RAY_GET_ARGS - - print("Asserting correctness") - for obj in tqdm(results): - expected = np.arange(10000) - assert (obj == expected).all() - - with_dese() - print("Done with dese") - with_zero_copy() - print("Done with zero copy") - - -def test_many_queued_tasks(): - sema = Semaphore.remote(0) - - @ray.remote(num_cpus=1) - def block(): - ray.get(sema.acquire.remote()) - - @ray.remote(num_cpus=1) - def f(): - pass - - num_cpus = int(ray.cluster_resources()["CPU"]) - blocked_tasks = [] - for _ in range(num_cpus): - blocked_tasks.append(block.remote()) - - print("Submitting many tasks") - pending_tasks = [] - for _ in trange(MAX_QUEUED_TASKS): - pending_tasks.append(f.remote()) - - # Make sure all the tasks can actually run. 
- for _ in range(num_cpus): - sema.release.remote() - - print("Unblocking tasks") - for ref in tqdm(pending_tasks): - assert ray.get(ref) is None - - -def test_large_object(): - print("Generating object") - obj = np.zeros(MAX_RAY_GET_SIZE, dtype=np.int8) - print("Putting object") - ref = ray.put(obj) - del obj - print("Getting object") - big_obj = ray.get(ref) - - assert big_obj[0] == 0 - assert big_obj[-1] == 0 - - -ray.init(address="auto") - -args_start = perf_counter() -test_many_args() -args_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished many args") - -returns_start = perf_counter() -test_many_returns() -returns_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished many returns") - -get_start = perf_counter() -test_ray_get_args() -get_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished ray.get on many objects") - -queued_start = perf_counter() -test_many_queued_tasks() -queued_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Finished queueing many tasks") - -large_object_start = perf_counter() -test_large_object() -large_object_end = perf_counter() - -assert ray.cluster_resources() == ray.available_resources() -print("Done") - -args_time = args_end - args_start -returns_time = returns_end - returns_start -get_time = get_end - get_start -queued_time = queued_end - queued_start -large_object_time = large_object_end - large_object_start - -print(f"Many args time: {args_time} ({MAX_ARGS} args)") -print(f"Many returns time: {returns_time} ({MAX_RETURNS} returns)") -print(f"Ray.get time: {get_time} ({MAX_RAY_GET_ARGS} args)") -print(f"Queued task time: {queued_time} ({MAX_QUEUED_TASKS} tasks)") -print(f"Ray.get large object time: {large_object_time} " - f"({MAX_RAY_GET_SIZE} bytes)") diff --git a/build-docker.sh b/build-docker.sh index 42f9068954f1..3a09b4896010 100755 --- 
a/build-docker.sh +++ b/build-docker.sh @@ -8,8 +8,7 @@ set -x GPU="" BASE_IMAGE="ubuntu:focal" WHEEL_URL="https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" -PYTHON_VERSION="3.7.7" - +PYTHON_VERSION="" while [[ $# -gt 0 ]] do @@ -17,7 +16,7 @@ key="$1" case $key in --gpu) GPU="-gpu" - BASE_IMAGE="nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" + BASE_IMAGE="nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04" ;; --no-cache-build) NO_CACHE="--no-cache" @@ -42,7 +41,6 @@ case $key in --python-version) # Python version to install. e.g. 3.7.7. # Changing python versions may require a different wheel. - # If not provided defaults to 3.7.7 shift PYTHON_VERSION=$1 ;; @@ -61,7 +59,7 @@ for IMAGE in "base-deps" "ray-deps" "ray" do cp "$WHEEL" "docker/$IMAGE/$(basename "$WHEEL")" if [ $OUTPUT_SHA ]; then - IMAGE_SHA=$(docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" --build-arg PYTHON_VERSION="$PYTHON_VERSION" -q -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE) + IMAGE_SHA=$(docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" -q -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE) echo "rayproject/$IMAGE:nightly$GPU SHA:$IMAGE_SHA" else docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" --build-arg PYTHON_VERSION="$PYTHON_VERSION" -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE diff --git a/ci/travis/bazel-format.sh b/ci/travis/bazel-format.sh index a97b97e6f777..3910529a4997 100755 --- a/ci/travis/bazel-format.sh +++ b/ci/travis/bazel-format.sh @@ -45,6 +45,6 @@ done pushd "$ROOT_DIR"/../.. 
BAZEL_FILES=(bazel/BUILD bazel/ray.bzl BUILD.bazel java/BUILD.bazel \ - cpp/BUILD.bazel cpp/example/BUILD.bazel streaming/BUILD.bazel streaming/java/BUILD.bazel WORKSPACE) + cpp/BUILD.bazel streaming/BUILD.bazel streaming/java/BUILD.bazel WORKSPACE) buildifier -mode=$RUN_TYPE -diff_command="diff -u" "${BAZEL_FILES[@]}" popd diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py index 8283f5c8fb0f..a2ae7a18d13c 100644 --- a/ci/travis/build-docker-images.py +++ b/ci/travis/build-docker-images.py @@ -1,12 +1,13 @@ import datetime -import json import functools import glob import os import re +import runpy import shutil -import subprocess import sys +from contextlib import redirect_stdout +from io import StringIO from typing import List, Tuple import docker @@ -14,7 +15,7 @@ print = functools.partial(print, file=sys.stderr, flush=True) DOCKER_USERNAME = "raytravisbot" DOCKER_CLIENT = None -PYTHON_WHL_VERSION = "cp3" +PYTHON_WHL_VERSION = "cp37m" DOCKER_HUB_DESCRIPTION = { "base-deps": ("Internal Image, refer to " @@ -28,8 +29,6 @@ "https://hub.docker.com/repository/docker/rayproject/ray-ml") } -PY_MATRIX = {"-py36": "3.6.12", "-py37": "3.7.7", "-py38": "3.8.5"} - def _merge_build(): return os.environ.get("TRAVIS_PULL_REQUEST").lower() == "false" @@ -53,30 +52,28 @@ def _get_root_dir(): return os.path.join(_get_curr_dir(), "../../") -def _get_wheel_name(minor_version_number): - if minor_version_number: - matches = glob.glob(f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}" - f"{minor_version_number}*-manylinux*") - assert len(matches) == 1, ( - f"Found ({len(matches)}) matches for '*{PYTHON_WHL_VERSION}" - f"{minor_version_number}*-manylinux*' instead of 1") - return os.path.basename(matches[0]) - else: - matches = glob.glob( - f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}*-manylinux*") - return [os.path.basename(i) for i in matches] +def _get_wheel_name(): + matches = glob.glob( + f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}-manylinux*") + 
assert len(matches) == 1, ( + f"Found ({len(matches)}) matches " + f"'*{PYTHON_WHL_VERSION}-manylinux*' instead of 1") + return os.path.basename(matches[0]) def _docker_affected(): - proc = subprocess.run( - [ - sys.executable, f"{_get_curr_dir()}/determine_tests_to_run.py", - "--output=json" - ], - capture_output=True) - affected_env_var_list = json.loads(proc.stdout) - affected = ("RAY_CI_DOCKER_AFFECTED" in affected_env_var_list or - "RAY_CI_PYTHON_DEPENDENCIES_AFFECTED" in affected_env_var_list) + result = StringIO() + with redirect_stdout(result): + runpy.run_path( + f"{_get_curr_dir()}/determine_tests_to_run.py", + run_name="__main__") + variable_definitions = result.getvalue().split() + env_var_dict = { + x.split("=")[0]: x.split("=")[1] + for x in variable_definitions + } + affected = env_var_dict["RAY_CI_DOCKER_AFFECTED"] == "1" or \ + env_var_dict["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED"] == "1" print(f"Docker affected: {affected}") return affected @@ -84,76 +81,64 @@ def _docker_affected(): def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]: built_images = [] for gpu in ["-cpu", "-gpu"]: - for py_name, py_version in PY_MATRIX.items(): - build_args = {} - build_args["PYTHON_VERSION"] = py_version - # I.e. 
"-py36"[-1] == 6 - build_args["PYTHON_MINOR_VERSION"] = py_name[-1] - - if image_name == "base-deps": - build_args["BASE_IMAGE"] = ( - "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" - if gpu == "-gpu" else "ubuntu:focal") + build_args = {} + if image_name == "base-deps": + build_args["BASE_IMAGE"] = ( + "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04" + if gpu == "-gpu" else "ubuntu:focal") + else: + build_args["GPU"] = gpu + + if "ray" in image_name: + build_args["WHEEL_PATH"] = f".whl/{_get_wheel_name()}" + + tagged_name = f"rayproject/{image_name}:nightly{gpu}" + for i in range(2): + output = DOCKER_CLIENT.api.build( + path=os.path.join(_get_root_dir(), "docker", image_name), + tag=tagged_name, + nocache=no_cache, + buildargs=build_args) + + full_output = "" + try: + start = datetime.datetime.now() + current_iter = start + for line in output: + if datetime.datetime.now( + ) - current_iter >= datetime.timedelta(minutes=5): + current_iter = datetime.datetime.now() + elapsed = datetime.datetime.now() - start + print(f"Still building {tagged_name} after " + f"{elapsed.seconds} seconds") + full_output += line.decode("utf-8") + except Exception as e: + print(f"FAILURE with error {e}") + + if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: + print(f"ERROR building: {tagged_name} & error below:") + print(full_output) + if (i == 1): + raise Exception("FAILED TO BUILD IMAGE") + print("TRYING AGAIN") else: - # NOTE(ilr) This is a bit of an abuse of the name "GPU" - build_args["GPU"] = f"{py_name}{gpu}" - - if image_name in ["ray", "ray-deps"]: - wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"]) - build_args["WHEEL_PATH"] = f".whl/{wheel}" - - tagged_name = f"rayproject/{image_name}:nightly{py_name}{gpu}" - for i in range(2): - cleanup = DOCKER_CLIENT.containers.prune().get( - "SpaceReclaimed") - if cleanup is not None: - print(f"Cleaned up {cleanup / (2**20)}MB") - output = DOCKER_CLIENT.api.build( - path=os.path.join(_get_root_dir(), "docker", image_name), - 
tag=tagged_name, - nocache=no_cache, - buildargs=build_args) - - full_output = "" - try: - start = datetime.datetime.now() - current_iter = start - for line in output: - if datetime.datetime.now( - ) - current_iter >= datetime.timedelta(minutes=5): - current_iter = datetime.datetime.now() - elapsed = datetime.datetime.now() - start - print(f"Still building {tagged_name} after " - f"{elapsed.seconds} seconds") - full_output += line.decode("utf-8") - except Exception as e: - print(f"FAILURE with error {e}") - - if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: - print(f"ERROR building: {tagged_name} & error below:") - print(full_output) - if (i == 1): - raise Exception("FAILED TO BUILD IMAGE") - print("TRYING AGAIN") - else: - break - - print("BUILT: ", tagged_name) - built_images.append(tagged_name) + break + + print("BUILT: ", tagged_name) + built_images.append(tagged_name) return built_images def copy_wheels(): root_dir = _get_root_dir() - wheels = _get_wheel_name(None) - for wheel in wheels: - source = os.path.join(root_dir, ".whl", wheel) - ray_dst = os.path.join(root_dir, "docker/ray/.whl/") - ray_dep_dst = os.path.join(root_dir, "docker/ray-deps/.whl/") - os.makedirs(ray_dst, exist_ok=True) - shutil.copy(source, ray_dst) - os.makedirs(ray_dep_dst, exist_ok=True) - shutil.copy(source, ray_dep_dst) + wheel = _get_wheel_name() + source = os.path.join(root_dir, ".whl", wheel) + ray_dst = os.path.join(root_dir, "docker/ray/.whl/") + ray_dep_dst = os.path.join(root_dir, "docker/ray-deps/.whl/") + os.makedirs(ray_dst, exist_ok=True) + shutil.copy(source, ray_dst) + os.makedirs(ray_dep_dst, exist_ok=True) + shutil.copy(source, ray_dep_dst) def build_or_pull_base_images(is_docker_affected: bool) -> List[str]: @@ -254,48 +239,31 @@ def get_new_tag(old_tag, new_tag): image_list.extend(["base-deps", "ray-deps"]) for image in image_list: - for py_version in PY_MATRIX.keys(): - full_image = f"rayproject/{image}" + full_image = f"rayproject/{image}" - # Tag "nightly-py3x" 
from "nightly-py3x-cpu" + # Generate :nightly from nightly-cpu + DOCKER_CLIENT.api.tag( + image=f"{full_image}:nightly-cpu", + repository=full_image, + tag="nightly") + + for arch_tag in ["-cpu", "-gpu", ""]: + full_arch_tag = f"nightly{arch_tag}" + # Do not tag release builds because they are no longer up to date + # after the branch cut. + if not _release_build(): + # Tag and push rayproject/:nightly + docker_push(full_image, full_arch_tag) + + # Ex: specific_tag == "1.0.1" or "" or "" + specific_tag = get_new_tag( + full_arch_tag, date_tag if "-deps" in image else sha_tag) + # Tag and push rayproject/: DOCKER_CLIENT.api.tag( - image=f"{full_image}:nightly{py_version}-cpu", + image=f"{full_image}:{full_arch_tag}", repository=full_image, - tag=f"nightly{py_version}") - - for arch_tag in ["-cpu", "-gpu", ""]: - full_arch_tag = f"nightly{py_version}{arch_tag}" - # Do not tag release builds because they are no longer up to - # date after the branch cut. - if not _release_build(): - # Tag and push rayproject/:nightly - docker_push(full_image, full_arch_tag) - - # Ex: specific_tag == "1.0.1" or "" or "" - specific_tag = get_new_tag( - full_arch_tag, date_tag if "-deps" in image else sha_tag) - - # Tag and push rayproject/: - DOCKER_CLIENT.api.tag( - image=f"{full_image}:{full_arch_tag}", - repository=full_image, - tag=specific_tag) - docker_push(full_image, specific_tag) - - if "-py37" in py_version: - non_python_specific_tag = specific_tag.replace("-py37", "") - DOCKER_CLIENT.api.tag( - image=f"{full_image}:{full_arch_tag}", - repository=full_image, - tag=non_python_specific_tag) - docker_push(full_image, non_python_specific_tag) - - non_python_nightly_tag = full_arch_tag.replace("-py37", "") - DOCKER_CLIENT.api.tag( - image=f"{full_image}:{full_arch_tag}", - repository=full_image, - tag=non_python_nightly_tag) - docker_push(full_image, non_python_nightly_tag) + tag=specific_tag) + docker_push(full_image, specific_tag) # Push infra here: @@ -338,14 +306,6 @@ def 
push_readmes(): if __name__ == "__main__": print("RUNNING WITH: ", sys.version) - if len(sys.argv) == 2: - version_to_drop = sys.argv[1] - if version_to_drop == "PY37": - PY_MATRIX.pop("-py36") - PY_MATRIX.pop("-py38") - else: - PY_MATRIX.pop("-py37") - print("Building the following python versions: ", PY_MATRIX) if os.environ.get("TRAVIS") == "true": is_docker_affected = _docker_affected() if _merge_build() or is_docker_affected: diff --git a/ci/travis/build-docker-images.sh b/ci/travis/build-docker-images.sh index 6463c880f649..c894da23a662 100755 --- a/ci/travis/build-docker-images.sh +++ b/ci/travis/build-docker-images.sh @@ -22,7 +22,7 @@ build_and_push_tags() { # $2 tag for image (e.g. hash of commit) for GPU in "" "-gpu" do - BASE_IMAGE=$(if [ "$GPU" ]; then echo "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04"; else echo "ubuntu:focal"; fi;) + BASE_IMAGE=$(if [ "$GPU" ]; then echo "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"; else echo "ubuntu:focal"; fi;) FULL_NAME_WITH_TAG="rayproject/$1:$2$GPU" NIGHTLY_FULL_NAME_WITH_TAG="rayproject/$1:nightly$GPU" docker build --no-cache --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH=".whl/$WHEEL" --label "SHA=$2" -t "$FULL_NAME_WITH_TAG" /"$ROOT_DIR"/docker/"$1" diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 9324853fee34..a403a4a9f522 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -140,19 +140,14 @@ test_python() { python/ray/serve/... python/ray/tests/... -python/ray/serve:test_api # segfault on windows? https://github.com/ray-project/ray/issues/12541 - -python/ray/serve:test_handle # "fatal error" (?) 
https://github.com/ray-project/ray/pull/13695 -python/ray/tests:test_actor_advanced # timeout -python/ray/tests:test_advanced_2 -python/ray/tests:test_advanced_3 # test_invalid_unicode_in_worker_log() fails on Windows -python/ray/tests:test_autoscaler_aws -python/ray/tests:test_component_failures - -python/ray/tests:test_component_failures_3 # timeout -python/ray/tests:test_basic_2 # hangs on shared cluster tests -python/ray/tests:test_basic_2_client_mode - -python/ray/tests:test_basic_3 # timeout - -python/ray/tests:test_basic_3_client_mode -python/ray/tests:test_cli - -python/ray/tests:test_client_init # timeout -python/ray/tests:test_failure -python/ray/tests:test_global_gc -python/ray/tests:test_job @@ -161,18 +156,14 @@ test_python() { -python/ray/tests:test_metrics_agent # timeout -python/ray/tests:test_multi_node -python/ray/tests:test_multi_node_2 - -python/ray/tests:test_multi_node_3 -python/ray/tests:test_multiprocessing # test_connect_to_ray() fails to connect to raylet -python/ray/tests:test_node_manager -python/ray/tests:test_object_manager - -python/ray/tests:test_placement_group # timeout and OOM -python/ray/tests:test_ray_init # test_redis_port() seems to fail here, but pass in isolation -python/ray/tests:test_resource_demand_scheduler -python/ray/tests:test_stress # timeout -python/ray/tests:test_stress_sharded # timeout -python/ray/tests:test_k8s_cluster_launcher - -python/ray/tests:test_k8s_operator_examples - -python/ray/tests:test_k8s_operator_mock ) fi if [ 0 -lt "${#args[@]}" ]; then # Any targets to test? 
@@ -192,9 +183,6 @@ test_cpp() { bazel build --config=ci //cpp:all # shellcheck disable=SC2046 bazel test --config=ci $(./scripts/bazel_export_options) //cpp:all --build_tests_only - # run the cpp example - bazel run //cpp/example:example - } test_wheels() { @@ -364,13 +352,9 @@ lint_web() { ( cd "${WORKSPACE_DIR}"/python/ray/new_dashboard/client set +x # suppress set -x since it'll get very noisy here - - if [ -z "${BUILDKITE-}" ]; then - . "${HOME}/.nvm/nvm.sh" - nvm use --silent node - fi - + . "${HOME}/.nvm/nvm.sh" install_npm_project + nvm use --silent node local filenames # shellcheck disable=SC2207 filenames=($(find src -name "*.ts" -or -name "*.tsx")) diff --git a/ci/travis/determine_tests_to_run.py b/ci/travis/determine_tests_to_run.py index be37a29469cc..70eefc16a566 100644 --- a/ci/travis/determine_tests_to_run.py +++ b/ci/travis/determine_tests_to_run.py @@ -9,7 +9,6 @@ import subprocess import sys from pprint import pformat -import argparse def list_changed_files(commit_range): @@ -31,44 +30,7 @@ def list_changed_files(commit_range): return [s.strip() for s in out.decode().splitlines() if s is not None] -def is_pull_request(): - event_type = None - - for key in ["GITHUB_EVENT_NAME", "TRAVIS_EVENT_TYPE"]: - event_type = os.getenv(key, event_type) - - if (os.environ.get("BUILDKITE") - and os.environ.get("BUILDKITE_PULL_REQUEST") != "false"): - event_type = "pull_request" - - return event_type == "pull_request" - - -def get_commit_range(): - commit_range = None - - if os.environ.get("TRAVIS"): - commit_range = os.environ["TRAVIS_COMMIT_RANGE"] - elif os.environ.get("GITHUB_EVENT_PATH"): - with open(os.environ["GITHUB_EVENT_PATH"], "rb") as f: - event = json.loads(f.read()) - base = event["pull_request"]["base"]["sha"] - commit_range = "{}...{}".format(base, event.get("after", "")) - elif os.environ.get("BUILDKITE"): - commit_range = "{}...{}".format( - os.environ["BUILDKITE_PULL_REQUEST_BASE_BRANCH"], - os.environ["BUILDKITE_COMMIT"], - ) - - assert 
commit_range is not None - return commit_range - - if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--output", type=str, help="json or envvars", default="envvars") - args = parser.parse_args() RAY_CI_TUNE_AFFECTED = 0 RAY_CI_SGD_AFFECTED = 0 @@ -88,10 +50,20 @@ def get_commit_range(): RAY_CI_DOC_AFFECTED = 0 RAY_CI_PYTHON_DEPENDENCIES_AFFECTED = 0 - if is_pull_request(): - commit_range = get_commit_range() + event_type = None + for key in ["GITHUB_EVENT_NAME", "TRAVIS_EVENT_TYPE"]: + event_type = os.getenv(key, event_type) + + if event_type == "pull_request": + + commit_range = os.getenv("TRAVIS_COMMIT_RANGE") + if commit_range is None: + with open(os.environ["GITHUB_EVENT_PATH"], "rb") as f: + event = json.loads(f.read()) + base = event["pull_request"]["base"]["sha"] + commit_range = "{}...{}".format(base, event.get("after", "")) files = list_changed_files(commit_range) - print(pformat(commit_range), file=sys.stderr) + print(pformat(files), file=sys.stderr) skip_prefix_list = [ @@ -152,8 +124,6 @@ def get_commit_range(): for prefix in skip_prefix_list): # nothing is run but linting in these cases pass - elif changed_file.endswith("build-docker-images.py"): - RAY_CI_DOCKER_AFFECTED = 1 elif changed_file.startswith("src/"): RAY_CI_TUNE_AFFECTED = 1 RAY_CI_SGD_AFFECTED = 1 @@ -215,7 +185,7 @@ def get_commit_range(): RAY_CI_ONLY_RLLIB_AFFECTED = 1 # Log the modified environment variables visible in console. 
- output_string = " ".join([ + print(" ".join([ "RAY_CI_TUNE_AFFECTED={}".format(RAY_CI_TUNE_AFFECTED), "RAY_CI_SGD_AFFECTED={}".format(RAY_CI_SGD_AFFECTED), "RAY_CI_ONLY_RLLIB_AFFECTED={}".format(RAY_CI_ONLY_RLLIB_AFFECTED), @@ -237,15 +207,4 @@ def get_commit_range(): "RAY_CI_DOCKER_AFFECTED={}".format(RAY_CI_DOCKER_AFFECTED), "RAY_CI_PYTHON_DEPENDENCIES_AFFECTED={}".format( RAY_CI_PYTHON_DEPENDENCIES_AFFECTED), - ]) - - # Debug purpose - print(output_string, file=sys.stderr) - - # Used by buildkite log format - if args.output.lower() == "json": - pairs = [item.split("=") for item in output_string.split(" ")] - affected_vars = [key for key, affected in pairs if affected == "1"] - print(json.dumps(affected_vars)) - else: - print(output_string) + ])) diff --git a/ci/travis/format.sh b/ci/travis/format.sh index bb916869cca2..3f4b753f4d12 100755 --- a/ci/travis/format.sh +++ b/ci/travis/format.sh @@ -107,8 +107,8 @@ MYPY_FILES=( 'autoscaler/node_provider.py' 'autoscaler/sdk.py' 'autoscaler/_private/commands.py' - 'ray_operator/operator.py' - 'ray_operator/operator_utils.py' + 'operator/operator.py' + 'operator/operator_utils.py' ) YAPF_EXCLUDES=( diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index ea4691723d99..8c42f694ce57 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -23,13 +23,6 @@ pkg_install_helper() { } install_bazel() { - if command -v bazel; then - if [ -n "${BUILDKITE-}" ]; then - echo "Bazel exists, skipping the install" - return - fi - fi - "${ROOT_DIR}"/install-bazel.sh if [ -f /etc/profile.d/bazel.sh ]; then . 
/etc/profile.d/bazel.sh @@ -37,11 +30,6 @@ install_bazel() { } install_base() { - if [ -n "${BUILDKITE-}" ]; then - echo "Skipping install_base in Buildkite" - return - fi - case "${OSTYPE}" in linux*) # Expired apt key error: https://github.com/bazelbuild/bazel/issues/11470#issuecomment-633205152 @@ -200,7 +188,9 @@ install_nvm() { > "${NVM_HOME}/nvm.sh" fi elif [ -n "${BUILDKITE-}" ]; then - echo "Skipping nvm on Buildkite because we will use apt-get." + # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions + curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - + sudo apt-get install -y nodejs else test -f "${NVM_HOME}/nvm.sh" # double-check NVM is already available on other platforms fi @@ -226,19 +216,10 @@ install_upgrade_pip() { } install_node() { - if command -v node; then - if [ -n "${BUILDKITE-}" ]; then - echo "Node existed, skipping install"; - return - fi - fi - if [ "${OSTYPE}" = msys ] ; then { echo "WARNING: Skipping running Node.js due to incompatibilities with Windows"; } 2> /dev/null elif [ -n "${BUILDKITE-}" ] ; then - # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions - curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - - sudo apt-get install -y nodejs + { echo "WARNING: Skipping running Node.js on buildkite because it's already there"; } 2> /dev/null else # Install the latest version of Node.js in order to build the dashboard. ( @@ -277,7 +258,7 @@ install_dependencies() { if [ -n "${PYTHON-}" ]; then # Remove this entire section once RLlib and Serve dependencies are fixed. - if [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then + if [ -z "${BUILDKITE-}" ] && [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then # PyTorch is installed first since we are using a "-f" directive to find the wheels. # We want to install the CPU version only. 
local torch_url="https://download.pytorch.org/whl/torch_stable.html" @@ -324,7 +305,13 @@ install_dependencies() { # Additional Tune/SGD/Doc test dependencies. if [ "${TUNE_TESTING-}" = 1 ] || [ "${SGD_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - pip install -r "${WORKSPACE_DIR}"/python/requirements/requirements_tune.txt + if [ -n "${PYTHON-}" ] && [ "${PYTHON-}" = "3.7" ]; then + # Install Python 3.7 dependencies if 3.7 is set. + pip install -r "${WORKSPACE_DIR}"/python/requirements/linux-py3.7-requirements_tune.txt + else + # Else default to Python 3.6. + pip install -r "${WORKSPACE_DIR}"/python/requirements/linux-py3.6-requirements_tune.txt + fi fi # For Tune, install upstream dependencies. diff --git a/cpp/BUILD.bazel b/cpp/BUILD.bazel index a4dc5b505dcb..af82486a0d2d 100644 --- a/cpp/BUILD.bazel +++ b/cpp/BUILD.bazel @@ -21,6 +21,7 @@ cc_library( "src/ray/util/*.h", "src/ray/*.cc", "src/ray/*.h", + "src/ray/worker/default_worker.cc", ]), hdrs = glob([ "include/ray/*.h", @@ -44,36 +45,18 @@ cc_library( ) cc_binary( - name = "default_worker", + name = "example", + testonly = 1, srcs = glob([ - "src/ray/worker/default_worker.cc", + "src/example/example.cc", ]), copts = COPTS, - linkstatic = True, + linkstatic = False, deps = [ "ray_api", ], ) -genrule( - name = "ray_cpp_pkg", - srcs = [ - "default_worker", - "ray_api", - ], - outs = ["ray_cpp_pkg.out"], - cmd = """ - WORK_DIR="$$(pwd)" && - mkdir -p "$$WORK_DIR/python/ray/core/src/ray/cpp/" && - cp -f $(location default_worker) "$$WORK_DIR/python/ray/core/src/ray/cpp/default_worker" && - cp -f $(locations ray_api) "$$WORK_DIR/python/ray/core/src/ray/cpp/" && - echo "$$WORK_DIR" > $@ - """, - local = 1, - visibility = ["//visibility:public"], -) - -# test cc_test( name = "api_test", srcs = glob([ @@ -93,32 +76,27 @@ cc_test( srcs = glob([ "src/ray/test/cluster/*.cc", ]), - args = [ - "$(location cluster_mode_test.so)", - ], copts = COPTS, - data = [ - "cluster_mode_test.so", - "ray_cpp_pkg", - ], 
linkstatic = True, deps = [ "ray_api", - "@com_github_gflags_gflags//:gflags", "@com_google_googletest//:gtest_main", ], ) -cc_binary( - name = "cluster_mode_test.so", - srcs = glob([ - "src/ray/test/cluster/*.cc", - ]), - copts = COPTS, - linkstatic = True, - deps = [ +genrule( + name = "ray_cpp_pkg", + srcs = [ + "cluster_mode_test", "ray_api", - "@com_github_gflags_gflags//:gflags", - "@com_google_googletest//:gtest_main", ], + outs = ["ray_cpp_pkg.out"], + cmd = """ + WORK_DIR="$$(pwd)" && + mkdir -p "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + cp -f $(location cluster_mode_test) "$$WORK_DIR/python/ray/core/src/ray/cpp/default_worker" && + cp -f $(locations ray_api) "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + echo "$$WORK_DIR" > $@ + """, + local = 1, ) diff --git a/cpp/dev_BUILD.bazel b/cpp/dev_BUILD.bazel new file mode 100644 index 000000000000..8c7470b99cbe --- /dev/null +++ b/cpp/dev_BUILD.bazel @@ -0,0 +1,74 @@ +# Bazel development build for C++ API. +# C/C++ documentation: https://docs.bazel.build/versions/master/be/c-cpp.html + +load("//bazel:ray.bzl", "COPTS") + +cc_library( + name = "ray_api", + srcs = glob([ + "src/ray/api.cc", + "src/ray/api/*.cc", + "src/ray/api/*.h", + "src/ray/app/*.cc", + "src/ray/app/*.h", + "src/ray/runtime/*.cc", + "src/ray/runtime/*.h", + "src/ray/runtime/**/*.cc", + "src/ray/runtime/**/*.h", + "src/ray/runtime/task/*.cc", + "src/ray/runtime/task/*.h", + "src/ray/util/*.cc", + "src/ray/util/*.h", + "src/ray/*.cc", + "src/ray/*.h", + "src/ray/worker/default_worker.cc", + ]), + hdrs = glob([ + "include/ray/*.h", + "include/ray/**/*.h", + "include/ray/**/**/*.h", + ]), + copts = COPTS, + linkopts = ["-ldl"], + linkstatic = True, + strip_include_prefix = "include", + visibility = ["//visibility:public"], + deps = [ + "//:core_worker_lib", + "//:ray_common", + "//:ray_util", + "@boost//:asio", + "@boost//:thread", + "@com_google_absl//absl/synchronization", + "@msgpack", + ], +) + +cc_binary( + name = "example", + srcs = 
glob([ + "src/ray/example/*.cc", + ]), + copts = COPTS, + linkstatic = True, + deps = [ + "ray_api", + ], +) + +genrule( + name = "ray_cpp_pkg", + srcs = [ + "example", + "ray_api", + ], + outs = ["ray_cpp_pkg.out"], + cmd = """ + WORK_DIR="$$(pwd)" && + mkdir -p "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + cp -f $(location example) "$$WORK_DIR/python/ray/core/src/ray/cpp/default_worker" && + cp -f $(locations ray_api) "$$WORK_DIR/python/ray/core/src/ray/cpp/" && + echo "$$WORK_DIR" > $@ + """, + local = 1, +) diff --git a/cpp/example/BUILD.bazel b/cpp/example/BUILD.bazel deleted file mode 100644 index a14212042812..000000000000 --- a/cpp/example/BUILD.bazel +++ /dev/null @@ -1,37 +0,0 @@ -# Bazel development build for C++ API. -# C/C++ documentation: https://docs.bazel.build/versions/master/be/c-cpp.html - -load("//bazel:ray.bzl", "COPTS") - -cc_binary( - name = "example", - srcs = glob([ - "*.cc", - ]), - args = [ - "--dynamic-library-path $(location example.so)", - ], - copts = COPTS, - data = [ - "example.so", - "//cpp:ray_cpp_pkg", - ], - linkstatic = True, - deps = [ - "//cpp:ray_api", - "@com_github_gflags_gflags//:gflags", - ], -) - -cc_binary( - name = "example.so", - srcs = glob([ - "*.cc", - ]), - copts = COPTS, - linkstatic = True, - deps = [ - "//cpp:ray_api", - "@com_github_gflags_gflags//:gflags", - ], -) diff --git a/cpp/include/ray/api/ray_config.h b/cpp/include/ray/api/ray_config.h index b8c4f0cd285e..b6bc55d5dcfe 100644 --- a/cpp/include/ray/api/ray_config.h +++ b/cpp/include/ray/api/ray_config.h @@ -34,13 +34,6 @@ class RayConfig { static std::shared_ptr GetInstance(); - void SetRedisAddress(const std::string address) { - auto pos = address.find(':'); - RAY_CHECK(pos != std::string::npos); - redis_ip = address.substr(0, pos); - redis_port = std::stoi(address.substr(pos + 1, address.length())); - } - private: static std::shared_ptr config_; }; diff --git a/cpp/include/ray/experimental/default_worker.h 
b/cpp/include/ray/experimental/default_worker.h new file mode 100644 index 000000000000..2c0e02259d6e --- /dev/null +++ b/cpp/include/ray/experimental/default_worker.h @@ -0,0 +1,9 @@ +#pragma once + +namespace ray { +namespace api { + +int default_worker_main(int argc, char **argv); + +} // namespace api +} // namespace ray diff --git a/cpp/src/example/example.cc b/cpp/src/example/example.cc new file mode 100644 index 000000000000..1375136caac0 --- /dev/null +++ b/cpp/src/example/example.cc @@ -0,0 +1,76 @@ + +/// This is a complete example of writing a distributed program using the C ++ worker API. + +/// including the header +#include + +/// using namespace +using namespace ::ray::api; + +/// general function of user code +int Return1() { return 1; } +int Plus1(int x) { return x + 1; } +int Plus(int x, int y) { return x + y; } + +/// a class of user code +class Counter { + public: + int count; + + Counter() { count = 0; } + + static Counter *FactoryCreate() { return new Counter(); } + /// non static function + int Add(int x) { + count += x; + return count; + } +}; + +int main() { + /// initialization + Ray::Init(); + + /// put and get object + auto obj = Ray::Put(123); + auto get_result = obj.Get(); + + /// general function remote call(args passed by value) + auto r0 = Ray::Task(Return1).Remote(); + auto r1 = Ray::Task(Plus1, 1).Remote(); + auto r2 = Ray::Task(Plus, 1, 2).Remote(); + + int result0 = *(r0.Get()); + int result1 = *(r1.Get()); + int result2 = *(r2.Get()); + + std::cout << "Ray::call with value results: " << result0 << " " << result1 << " " + << result2 << std::endl; + + /// general function remote call(args passed by reference) + auto r3 = Ray::Task(Return1).Remote(); + auto r4 = Ray::Task(Plus1, r3).Remote(); + auto r5 = Ray::Task(Plus, r4, 1).Remote(); + + int result3 = *(r3.Get()); + int result4 = *(r4.Get()); + int result5 = *(r5.Get()); + + std::cout << "Ray::call with reference results: " << result3 << " " << result4 << " " + << result5 << 
std::endl; + + /// create actor and actor function remote call + ActorHandle actor = Ray::Actor(Counter::FactoryCreate).Remote(); + auto r6 = actor.Task(&Counter::Add, 5).Remote(); + auto r7 = actor.Task(&Counter::Add, 1).Remote(); + auto r8 = actor.Task(&Counter::Add, 1).Remote(); + auto r9 = actor.Task(&Counter::Add, r8).Remote(); + + int result6 = *(r6.Get()); + int result7 = *(r7.Get()); + int result8 = *(r8.Get()); + int result9 = *(r9.Get()); + + std::cout << "Ray::call with actor results: " << result6 << " " << result7 << " " + << result8 << " " << result9 << std::endl; +} diff --git a/cpp/example/example.cc b/cpp/src/ray/example/example.cc similarity index 81% rename from cpp/example/example.cc rename to cpp/src/ray/example/example.cc index 13f82192d0ab..7ada6f1f5f22 100644 --- a/cpp/example/example.cc +++ b/cpp/src/ray/example/example.cc @@ -1,12 +1,8 @@ -/// This is a complete example of writing a distributed program using the C ++ worker API. - -/// including the header #include #include -#include "gflags/gflags.h" +#include -/// using namespace using namespace ::ray::api; /// general function of user code @@ -36,25 +32,22 @@ class Counter { } }; -DEFINE_string(redis_address, "", "The ip address of redis server."); - -DEFINE_string(dynamic_library_path, "", "The local path of the dynamic library."); - int main(int argc, char **argv) { - /// configuration - gflags::ParseCommandLineFlags(&argc, &argv, true); - const std::string dynamic_library_path = FLAGS_dynamic_library_path; - const std::string redis_address = FLAGS_redis_address; - gflags::ShutDownCommandLineFlags(); - RAY_CHECK(!dynamic_library_path.empty()) - << "Please add a local dynamic library by '--dynamic-library-path'"; - ray::api::RayConfig::GetInstance()->lib_name = dynamic_library_path; - if (!redis_address.empty()) { - ray::api::RayConfig::GetInstance()->SetRedisAddress(redis_address); + /// Currently, we compile `default_worker` and `example` in one single binary, + /// to work around a 
symbol conflicting issue. + /// This is the main function of the binary, and we use the `is_default_worker` arg to + /// tell if this binary is used as `default_worker` or `example`. + const char *default_worker_magic = "is_default_worker"; + /// `is_default_worker` is the last arg of `argv` + if (argc > 1 && + memcmp(argv[argc - 1], default_worker_magic, strlen(default_worker_magic)) == 0) { + default_worker_main(argc, argv); + return 0; } - ::ray::api::RayConfig::GetInstance()->run_mode = RunMode::CLUSTER; - - /// initialization + /// initialization to cluster mode + ray::api::RayConfig::GetInstance()->run_mode = RunMode::CLUSTER; + /// Dynamic library loading is not supported yet. + ray::api::RayConfig::GetInstance()->lib_name = ""; Ray::Init(); /// put and get object @@ -93,6 +86,7 @@ int main(int argc, char **argv) { /// general function remote call(args passed by value) auto r0 = Ray::Task(Return1).Remote(); auto r2 = Ray::Task(Plus, 3, 22).Remote(); + int task_result3 = *(Ray::Get(r2)); std::cout << "task_result3 = " << task_result3 << std::endl; @@ -101,6 +95,7 @@ int main(int argc, char **argv) { auto r4 = Ray::Task(Plus1, r3).Remote(); auto r5 = Ray::Task(Plus, r4, r3).Remote(); auto r6 = Ray::Task(Plus, r4, 10).Remote(); + int task_result4 = *(Ray::Get(r6)); int task_result5 = *(Ray::Get(r5)); std::cout << "task_result4 = " << task_result4 << ", task_result5 = " << task_result5 @@ -109,30 +104,31 @@ int main(int argc, char **argv) { /// create actor and actor function remote call with args passed by value ActorHandle actor4 = Ray::Actor(Counter::FactoryCreate, 10).Remote(); auto r10 = actor4.Task(&Counter::Add, 8).Remote(); + int actor_result4 = *(Ray::Get(r10)); std::cout << "actor_result4 = " << actor_result4 << std::endl; /// create actor and task function remote call with args passed by reference ActorHandle actor5 = Ray::Actor(Counter::FactoryCreate, r10, 0).Remote(); + auto r11 = actor5.Task(&Counter::Add, r0).Remote(); auto r12 = 
actor5.Task(&Counter::Add, r11).Remote(); auto r13 = actor5.Task(&Counter::Add, r10).Remote(); auto r14 = actor5.Task(&Counter::Add, r13).Remote(); auto r15 = Ray::Task(Plus, r0, r11).Remote(); auto r16 = Ray::Task(Plus1, r15).Remote(); + int result12 = *(Ray::Get(r12)); int result14 = *(Ray::Get(r14)); int result11 = *(Ray::Get(r11)); int result13 = *(Ray::Get(r13)); int result16 = *(Ray::Get(r16)); int result15 = *(Ray::Get(r15)); + std::cout << "Final result:" << std::endl; std::cout << "result11 = " << result11 << ", result12 = " << result12 << ", result13 = " << result13 << ", result14 = " << result14 << ", result15 = " << result15 << ", result16 = " << result16 << std::endl; - - /// shutdown Ray::Shutdown(); - return 0; } diff --git a/cpp/src/ray/runtime/task/task_executor.cc b/cpp/src/ray/runtime/task/task_executor.cc index d0879112fcf3..f2b06af09370 100644 --- a/cpp/src/ray/runtime/task/task_executor.cc +++ b/cpp/src/ray/runtime/task/task_executor.cc @@ -29,7 +29,7 @@ Status TaskExecutor::ExecuteTask( const std::vector &arg_reference_ids, const std::vector &return_ids, const std::string &debugger_breakpoint, std::vector> *results) { - RAY_LOG(INFO) << "Execute task: " << TaskType_Name(task_type); + RAY_LOG(INFO) << "TaskExecutor::ExecuteTask"; RAY_CHECK(ray_function.GetLanguage() == Language::CPP); auto function_descriptor = ray_function.GetFunctionDescriptor(); RAY_CHECK(function_descriptor->Type() == diff --git a/cpp/src/ray/test/cluster/cluster_mode_test.cc b/cpp/src/ray/test/cluster/cluster_mode_test.cc index e00c6af14958..780fb0d3024c 100644 --- a/cpp/src/ray/test/cluster/cluster_mode_test.cc +++ b/cpp/src/ray/test/cluster/cluster_mode_test.cc @@ -2,6 +2,7 @@ #include #include #include +#include using namespace ::ray::api; @@ -32,16 +33,11 @@ class Counter { } }; -std::string lib_name = ""; - -std::string redis_ip = ""; - TEST(RayClusterModeTest, FullTest) { /// initialization to cluster mode ray::api::RayConfig::GetInstance()->run_mode = 
RunMode::CLUSTER; /// TODO(Guyang Song): add the dynamic library name - ray::api::RayConfig::GetInstance()->lib_name = lib_name; - ray::api::RayConfig::GetInstance()->redis_ip = redis_ip; + ray::api::RayConfig::GetInstance()->lib_name = ""; Ray::Init(); /// put and get object @@ -148,11 +144,18 @@ TEST(RayClusterModeTest, FullTest) { Ray::Shutdown(); } +/// TODO(Guyang Song): Separate default worker from this test. +/// Currently, we compile `default_worker` and `cluster_mode_test` in one single binary, +/// to work around a symbol conflicting issue. +/// This is the main function of the binary, and we use the `is_default_worker` arg to +/// tell if this binary is used as `default_worker` or `cluster_mode_test`. int main(int argc, char **argv) { - RAY_CHECK(argc == 2 || argc == 3); - lib_name = std::string(argv[1]); - if (argc == 3) { - redis_ip = std::string(argv[2]); + const char *default_worker_magic = "is_default_worker"; + /// `is_default_worker` is the last arg of `argv` + if (argc > 1 && + memcmp(argv[argc - 1], default_worker_magic, strlen(default_worker_magic)) == 0) { + default_worker_main(argc, argv); + return 0; } ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/cpp/src/ray/util/function_helper.cc b/cpp/src/ray/util/function_helper.cc index 8693ea6b1466..5dfa8a012904 100644 --- a/cpp/src/ray/util/function_helper.cc +++ b/cpp/src/ray/util/function_helper.cc @@ -14,14 +14,19 @@ uintptr_t base_addr = 0; static const uintptr_t BaseAddressForHandle(void *handle) { /// TODO(Guyang Song): Implement a cross-platform function. - return (uintptr_t)((NULL == handle) ? NULL : (void *)*(size_t const *)(handle)); + /// Not Implemented. + return -1; } uintptr_t FunctionHelper::LoadLibrary(std::string lib_name) { + if (dynamic_library_base_addr != 0) { + /// Base address has been generated. + return dynamic_library_base_addr; + } /// Generate base address from library. 
RAY_LOG(INFO) << "Start load library " << lib_name; - void *handle = dlopen(lib_name.c_str(), RTLD_LAZY); - uintptr_t base_addr = BaseAddressForHandle(handle); + void *example = dlopen(lib_name.c_str(), RTLD_LAZY); + uintptr_t base_addr = BaseAddressForHandle(example); RAY_CHECK(base_addr > 0); RAY_LOG(INFO) << "Loaded library " << lib_name << " to base address " << base_addr; loaded_library_.emplace(lib_name, base_addr); diff --git a/cpp/src/ray/util/process_helper.cc b/cpp/src/ray/util/process_helper.cc index 6511b5b8b96b..7227337edf4d 100644 --- a/cpp/src/ray/util/process_helper.cc +++ b/cpp/src/ray/util/process_helper.cc @@ -70,12 +70,7 @@ void ProcessHelper::RayStart(std::shared_ptr config, options.store_socket = store_socket; options.raylet_socket = raylet_socket; if (options.worker_type == WorkerType::DRIVER) { - /// TODO(Guyang Song): Get next job id from core worker by GCS client. - /// Random a number to avoid repeated job ids. - /// The repeated job ids will lead to task hang when driver connects to a existing - /// cluster more than once. 
- std::srand(std::time(nullptr)); - options.job_id = JobID::FromInt(std::rand()); + options.job_id = JobID::FromInt(1); } options.gcs_options = gcs_options; options.enable_logging = true; diff --git a/cpp/src/ray/worker/default_worker.cc b/cpp/src/ray/worker/default_worker.cc index dd61bb457bed..2ebfb8d6ca9c 100644 --- a/cpp/src/ray/worker/default_worker.cc +++ b/cpp/src/ray/worker/default_worker.cc @@ -3,11 +3,14 @@ #include #include -using namespace ::ray::api; +using namespace ::ray; + +namespace ray { +namespace api { int default_worker_main(int argc, char **argv) { RAY_LOG(INFO) << "CPP default worker started"; - RAY_CHECK(argc == 7); + RAY_CHECK(argc == 8); auto config = ray::api::RayConfig::GetInstance(); config->run_mode = RunMode::CLUSTER; @@ -16,7 +19,10 @@ int default_worker_main(int argc, char **argv) { config->raylet_socket = std::string(argv[2]); config->node_manager_port = std::stoi(std::string(argv[3])); std::string redis_address = std::string(std::string(argv[4])); - config->SetRedisAddress(redis_address); + auto pos = redis_address.find(':'); + RAY_CHECK(pos != std::string::npos); + config->redis_ip = redis_address.substr(0, pos); + config->redis_port = std::stoi(redis_address.substr(pos + 1, redis_address.length())); config->redis_password = std::string(std::string(argv[5])); config->session_dir = std::string(std::string(argv[6])); @@ -26,7 +32,5 @@ int default_worker_main(int argc, char **argv) { return 0; } -int main(int argc, char **argv) { - default_worker_main(argc, argv); - return 0; -} +} // namespace api +} // namespace ray diff --git a/dashboard/agent.py b/dashboard/agent.py index 7f77e2f3c09c..f34024e545c7 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -7,7 +7,6 @@ import sys import socket import json -import time import traceback import aiohttp @@ -76,9 +75,8 @@ def __init__(self, logger.info("Dashboard agent grpc address: %s:%s", self.ip, self.grpc_port) self.aioredis_client = None - options = (("grpc.enable_http_proxy", 
0), ) self.aiogrpc_raylet_channel = aiogrpc.insecure_channel( - f"{self.ip}:{self.node_manager_port}", options=options) + f"{self.ip}:{self.node_manager_port}") self.http_session = None def _load_modules(self): @@ -186,11 +184,8 @@ async def _check_parent(): agent_port=self.grpc_port, agent_ip_address=self.ip)) - tasks = [m.run(self.server) for m in modules] - if sys.platform not in ["win32", "cygwin"]: - tasks.append(check_parent_task) - await asyncio.gather(*tasks) - + await asyncio.gather(check_parent_task, + *(m.run(self.server) for m in modules)) await self.server.wait_for_termination() # Wait for finish signal. await runner.cleanup() @@ -300,16 +295,6 @@ async def _check_parent(): max_bytes=args.logging_rotate_bytes, backup_count=args.logging_rotate_backup_count) - # The dashboard is currently broken on Windows. - # https://github.com/ray-project/ray/issues/14026. - if sys.platform == "win32": - logger.warning( - "The dashboard is currently disabled on windows." - "See https://github.com/ray-project/ray/issues/14026" - "for more details") - while True: - time.sleep(999) - agent = DashboardAgent( args.node_ip_address, args.redis_address, diff --git a/dashboard/client/package-lock.json b/dashboard/client/package-lock.json index eccde1558ae4..8b66129425d1 100644 --- a/dashboard/client/package-lock.json +++ b/dashboard/client/package-lock.json @@ -1,41 +1,29 @@ { - "name": "ray-dashboard-client", - "version": "1.0.0", + "name": "client", + "version": "0.1.0", "lockfileVersion": 2, "requires": true, "packages": { "": { - "name": "ray-dashboard-client", - "version": "1.0.0", + "version": "0.1.0", "dependencies": { "@material-ui/core": "4.11.0", "@material-ui/icons": "^4.9.1", "@material-ui/lab": "^4.0.0-alpha.56", - "@material-ui/pickers": "^3.2.10", "@reduxjs/toolkit": "^1.3.1", "@types/classnames": "^2.2.10", "@types/jest": "25.1.4", - "@types/lodash": "^4.14.161", - "@types/lowlight": "^0.0.1", "@types/node": "13.9.5", - "@types/numeral": "^0.0.26", 
"@types/react": "16.9.26", "@types/react-dom": "16.9.5", "@types/react-redux": "^7.1.7", "@types/react-router-dom": "^5.1.3", - "@types/react-window": "^1.8.2", - "axios": "^0.21.1", "classnames": "^2.2.6", - "dayjs": "^1.9.4", - "lodash": "^4.17.20", - "lowlight": "^1.14.0", - "numeral": "^2.0.6", "react": "^16.13.1", "react-dom": "^16.13.1", "react-redux": "^7.2.0", "react-router-dom": "^5.1.2", "react-scripts": "^3.4.3", - "react-window": "^1.8.5", "typeface-roboto": "0.0.75", "typescript": "3.8.3", "use-debounce": "^3.4.3" @@ -1332,11 +1320,6 @@ "resolved": "https://registry.npmjs.org/@csstools/normalize.css/-/normalize.css-10.1.0.tgz", "integrity": "sha512-ij4wRiunFfaJxjB0BdrYHIH8FxBJpOwNPhhAcunlmPdXudL1WQV1qoP9un6JsEBAgQH+7UXyyjh0g7jTxXK6tg==" }, - "node_modules/@date-io/core": { - "version": "1.3.13", - "resolved": "https://registry.npmjs.org/@date-io/core/-/core-1.3.13.tgz", - "integrity": "sha512-AlEKV7TxjeK+jxWVKcCFrfYAk8spX9aCyiToFIiLPtfQbsjmRGLIhb5VZgptQcJdHtLXo7+m0DuurwFgUToQuA==" - }, "node_modules/@emotion/hash": { "version": "0.8.0", "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.8.0.tgz", @@ -1876,26 +1859,6 @@ "node": ">=8.0.0" } }, - "node_modules/@material-ui/pickers": { - "version": "3.2.10", - "resolved": "https://registry.npmjs.org/@material-ui/pickers/-/pickers-3.2.10.tgz", - "integrity": "sha512-B8G6Obn5S3RCl7hwahkQj9sKUapwXWFjiaz/Bsw1fhYFdNMnDUolRiWQSoKPb1/oKe37Dtfszoywi1Ynbo3y8w==", - "dependencies": { - "@babel/runtime": "^7.6.0", - "@date-io/core": "1.x", - "@types/styled-jsx": "^2.2.8", - "clsx": "^1.0.2", - "react-transition-group": "^4.0.0", - "rifm": "^0.7.0" - }, - "peerDependencies": { - "@date-io/core": "^1.3.6", - "@material-ui/core": "^4.0.0", - "prop-types": "^15.6.0", - "react": "^16.8.4", - "react-dom": "^16.8.4" - } - }, "node_modules/@material-ui/styles": { "version": "4.10.0", "resolved": "https://registry.npmjs.org/@material-ui/styles/-/styles-4.10.0.tgz", @@ -2242,16 +2205,6 @@ "resolved": 
"https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.5.tgz", "integrity": "sha512-7+2BITlgjgDhH0vvwZU/HZJVyk+2XUlvxXe8dFMedNX/aMkaOq++rMAFXc0tM7ij15QaWlbdQASBR9dihi+bDQ==" }, - "node_modules/@types/lodash": { - "version": "4.14.168", - "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.168.tgz", - "integrity": "sha512-oVfRvqHV/V6D1yifJbVRU3TMp8OT6o6BG+U9MkwuJ3U8/CsDHvalRpsxBqivn71ztOFZBTfJMvETbqHiaNSj7Q==" - }, - "node_modules/@types/lowlight": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/@types/lowlight/-/lowlight-0.0.1.tgz", - "integrity": "sha512-yPpbpV1KfpFOZ0ZZbsgwWumraiAKoX7/Ng75Ah//w+ZBt4j0xwrQ2aHSlk2kPzQVK4LiPbNFE1LjC00IL4nl/A==" - }, "node_modules/@types/minimatch": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/minimatch/-/minimatch-3.0.3.tgz", @@ -2262,11 +2215,6 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-13.9.5.tgz", "integrity": "sha512-hkzMMD3xu6BrJpGVLeQ3htQQNAcOrJjX7WFmtK8zWQpz2UJf13LCFF2ALA7c9OVdvc2vQJeDdjfR35M0sBCxvw==" }, - "node_modules/@types/numeral": { - "version": "0.0.26", - "resolved": "https://registry.npmjs.org/@types/numeral/-/numeral-0.0.26.tgz", - "integrity": "sha512-DwCsRqeOWopdEsm5KLTxKVKDSDoj+pzZD1vlwu1GQJ6IF3RhjuleYlRwyRH6MJLGaf3v8wFTnC6wo3yYfz0bnA==" - }, "node_modules/@types/parse-json": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.0.tgz", @@ -2337,27 +2285,11 @@ "@types/react": "*" } }, - "node_modules/@types/react-window": { - "version": "1.8.2", - "resolved": "https://registry.npmjs.org/@types/react-window/-/react-window-1.8.2.tgz", - "integrity": "sha512-gP1xam68Wc4ZTAee++zx6pTdDAH08rAkQrWm4B4F/y6hhmlT9Mgx2q8lTCXnrPHXsr15XjRN9+K2DLKcz44qEQ==", - "dependencies": { - "@types/react": "*" - } - }, "node_modules/@types/stack-utils": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-1.0.1.tgz", "integrity": 
"sha512-l42BggppR6zLmpfU6fq9HEa2oGPEI8yrSPL3GITjfRInppYFahObbIQOQK3UGxEnyQpltZLaPe75046NOZQikw==" }, - "node_modules/@types/styled-jsx": { - "version": "2.2.8", - "resolved": "https://registry.npmjs.org/@types/styled-jsx/-/styled-jsx-2.2.8.tgz", - "integrity": "sha512-Yjye9VwMdYeXfS71ihueWRSxrruuXTwKCbzue4+5b2rjnQ//AtyM7myZ1BEhNhBQ/nL/RE7bdToUoLln2miKvg==", - "dependencies": { - "@types/react": "*" - } - }, "node_modules/@types/yargs": { "version": "13.0.11", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-13.0.11.tgz", @@ -3075,14 +3007,6 @@ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.10.1.tgz", "integrity": "sha512-zg7Hz2k5lI8kb7U32998pRRFin7zJlkfezGJjUc2heaD4Pw2wObakCDVzkKztTm/Ln7eiVvYsjqak0Ed4LkMDA==" }, - "node_modules/axios": { - "version": "0.21.1", - "resolved": "https://registry.npmjs.org/axios/-/axios-0.21.1.tgz", - "integrity": "sha512-dKQiRHxGD9PPRIUNIWvZhPTPpl1rf/OxTYKsqKUDjBwYylTvV7SjSHJb9ratfyzM6wCdLCOYLzs73qpg5c4iGA==", - "dependencies": { - "follow-redirects": "^1.10.0" - } - }, "node_modules/axobject-query": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-2.2.0.tgz", @@ -5234,11 +5158,6 @@ "webidl-conversions": "^4.0.2" } }, - "node_modules/dayjs": { - "version": "1.10.4", - "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.10.4.tgz", - "integrity": "sha512-RI/Hh4kqRc1UKLOAf/T5zdMMX5DQIlDxwUe3wSyMMnEbGunnpENCdbUgM+dW7kXidZqCttBrmw7BhN4TMddkCw==" - }, "node_modules/debug": { "version": "4.3.1", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", @@ -7066,18 +6985,6 @@ "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=" }, - "node_modules/fault": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/fault/-/fault-1.0.4.tgz", - "integrity": "sha512-CJ0HCB5tL5fYTEA7ToAq5+kTwd++Borf1/bifxd9iT70QcXr4MRrO3Llf8Ifs70q+SJcGHFtnIE/Nw6giCtECA==", - "dependencies": { - 
"format": "^0.2.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, "node_modules/faye-websocket": { "version": "0.10.0", "resolved": "https://registry.npmjs.org/faye-websocket/-/faye-websocket-0.10.0.tgz", @@ -7411,14 +7318,6 @@ "node": ">= 0.12" } }, - "node_modules/format": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", - "integrity": "sha1-1hcBB+nv3E7TDJ3DkBbflCtctYs=", - "engines": { - "node": ">=0.4.x" - } - }, "node_modules/forwarded": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", @@ -7905,14 +7804,6 @@ "resolved": "https://registry.npmjs.org/hex-color-regex/-/hex-color-regex-1.1.0.tgz", "integrity": "sha512-l9sfDFsuqtOqKDsQdqrMRk0U85RZc0RtOR9yPI7mRVOa4FsR/BVnZ0shmQRM96Ji99kYZP/7hn1cedc1+ApsTQ==" }, - "node_modules/highlight.js": { - "version": "10.5.0", - "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.5.0.tgz", - "integrity": "sha512-xTmvd9HiIHR6L53TMC7TKolEj65zG1XU+Onr8oi86mYa+nLcIbxTTWkpW7CsEwv/vK7u1zb8alZIMLDqqN6KTw==", - "engines": { - "node": "*" - } - }, "node_modules/history": { "version": "4.10.1", "resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz", @@ -8300,9 +8191,12 @@ "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" }, "node_modules/ini": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", - "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.5.tgz", + "integrity": "sha512-RZY5huIKCMRWDUqZlEi72f/lmXKMvuszcMBduliQ3nnWbx9X/ZBQO7DijMEYS9EhHBb2qacRUMtC7svLwe0lcw==", + "engines": { + "node": "*" + } }, "node_modules/inquirer": { "version": "7.0.4", @@ -11107,19 +11001,6 @@ "tslib": "^1.10.0" } }, - "node_modules/lowlight": { - "version": 
"1.18.0", - "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-1.18.0.tgz", - "integrity": "sha512-Zlc3GqclU71HRw5fTOy00zz5EOlqAdKMYhOFIO8ay4SQEDQgFuhR8JNwDIzAGMLoqTsWxe0elUNmq5o2USRAzw==", - "dependencies": { - "fault": "^1.0.0", - "highlight.js": "~10.5.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, "node_modules/lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -11216,11 +11097,6 @@ "node": ">= 0.6" } }, - "node_modules/memoize-one": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/memoize-one/-/memoize-one-5.1.1.tgz", - "integrity": "sha512-HKeeBpWvqiVJD57ZUAsJNm71eHTykffzcLZVYWiVfQeI1rJtuEaS7hQiEpWfVVk18donPwJEcFKIkCmPJNOhHA==" - }, "node_modules/memory-fs": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/memory-fs/-/memory-fs-0.4.1.tgz", @@ -11861,14 +11737,6 @@ "resolved": "https://registry.npmjs.org/num2fraction/-/num2fraction-1.2.2.tgz", "integrity": "sha1-b2gragJ6Tp3fpFZM0lidHU5mnt4=" }, - "node_modules/numeral": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/numeral/-/numeral-2.0.6.tgz", - "integrity": "sha1-StCAk21EPCVhrtnyGX7//iX05QY=", - "engines": { - "node": "*" - } - }, "node_modules/nwsapi": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.0.tgz", @@ -14503,22 +14371,6 @@ "prop-types": "^15.6.2" } }, - "node_modules/react-window": { - "version": "1.8.6", - "resolved": "https://registry.npmjs.org/react-window/-/react-window-1.8.6.tgz", - "integrity": "sha512-8VwEEYyjz6DCnGBsd+MgkD0KJ2/OXFULyDtorIiTz+QzwoP94tBoA7CnbtyXMm+cCeAUER5KJcPtWl9cpKbOBg==", - "dependencies": { - "@babel/runtime": "^7.0.0", - "memoize-one": ">=3.1.1 <6" - }, - "engines": { - "node": ">8.0.0" - }, - "peerDependencies": { - "react": "^15.0.0 || ^16.0.0 || ^17.0.0", - "react-dom": "^15.0.0 || ^16.0.0 || ^17.0.0" - } - }, "node_modules/read-pkg": { "version": "3.0.0", "resolved": 
"https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", @@ -15109,17 +14961,6 @@ "resolved": "https://registry.npmjs.org/rgba-regex/-/rgba-regex-1.0.0.tgz", "integrity": "sha1-QzdOLiyglosO8VI0YLfXMP8i7rM=" }, - "node_modules/rifm": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/rifm/-/rifm-0.7.0.tgz", - "integrity": "sha512-DSOJTWHD67860I5ojetXdEQRIBvF6YcpNe53j0vn1vp9EUb9N80EiZTxgP+FkDKorWC8PZw052kTF4C1GOivCQ==", - "dependencies": { - "@babel/runtime": "^7.3.1" - }, - "peerDependencies": { - "react": ">=16.8" - } - }, "node_modules/rimraf": { "version": "2.6.3", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", @@ -19427,11 +19268,6 @@ "resolved": "https://registry.npmjs.org/@csstools/normalize.css/-/normalize.css-10.1.0.tgz", "integrity": "sha512-ij4wRiunFfaJxjB0BdrYHIH8FxBJpOwNPhhAcunlmPdXudL1WQV1qoP9un6JsEBAgQH+7UXyyjh0g7jTxXK6tg==" }, - "@date-io/core": { - "version": "1.3.13", - "resolved": "https://registry.npmjs.org/@date-io/core/-/core-1.3.13.tgz", - "integrity": "sha512-AlEKV7TxjeK+jxWVKcCFrfYAk8spX9aCyiToFIiLPtfQbsjmRGLIhb5VZgptQcJdHtLXo7+m0DuurwFgUToQuA==" - }, "@emotion/hash": { "version": "0.8.0", "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.8.0.tgz", @@ -19879,19 +19715,6 @@ "react-is": "^16.8.0" } }, - "@material-ui/pickers": { - "version": "3.2.10", - "resolved": "https://registry.npmjs.org/@material-ui/pickers/-/pickers-3.2.10.tgz", - "integrity": "sha512-B8G6Obn5S3RCl7hwahkQj9sKUapwXWFjiaz/Bsw1fhYFdNMnDUolRiWQSoKPb1/oKe37Dtfszoywi1Ynbo3y8w==", - "requires": { - "@babel/runtime": "^7.6.0", - "@date-io/core": "1.x", - "@types/styled-jsx": "^2.2.8", - "clsx": "^1.0.2", - "react-transition-group": "^4.0.0", - "rifm": "^0.7.0" - } - }, "@material-ui/styles": { "version": "4.10.0", "resolved": "https://registry.npmjs.org/@material-ui/styles/-/styles-4.10.0.tgz", @@ -20181,16 +20004,6 @@ "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.5.tgz", "integrity": 
"sha512-7+2BITlgjgDhH0vvwZU/HZJVyk+2XUlvxXe8dFMedNX/aMkaOq++rMAFXc0tM7ij15QaWlbdQASBR9dihi+bDQ==" }, - "@types/lodash": { - "version": "4.14.168", - "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.168.tgz", - "integrity": "sha512-oVfRvqHV/V6D1yifJbVRU3TMp8OT6o6BG+U9MkwuJ3U8/CsDHvalRpsxBqivn71ztOFZBTfJMvETbqHiaNSj7Q==" - }, - "@types/lowlight": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/@types/lowlight/-/lowlight-0.0.1.tgz", - "integrity": "sha512-yPpbpV1KfpFOZ0ZZbsgwWumraiAKoX7/Ng75Ah//w+ZBt4j0xwrQ2aHSlk2kPzQVK4LiPbNFE1LjC00IL4nl/A==" - }, "@types/minimatch": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/minimatch/-/minimatch-3.0.3.tgz", @@ -20201,11 +20014,6 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-13.9.5.tgz", "integrity": "sha512-hkzMMD3xu6BrJpGVLeQ3htQQNAcOrJjX7WFmtK8zWQpz2UJf13LCFF2ALA7c9OVdvc2vQJeDdjfR35M0sBCxvw==" }, - "@types/numeral": { - "version": "0.0.26", - "resolved": "https://registry.npmjs.org/@types/numeral/-/numeral-0.0.26.tgz", - "integrity": "sha512-DwCsRqeOWopdEsm5KLTxKVKDSDoj+pzZD1vlwu1GQJ6IF3RhjuleYlRwyRH6MJLGaf3v8wFTnC6wo3yYfz0bnA==" - }, "@types/parse-json": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.0.tgz", @@ -20276,27 +20084,11 @@ "@types/react": "*" } }, - "@types/react-window": { - "version": "1.8.2", - "resolved": "https://registry.npmjs.org/@types/react-window/-/react-window-1.8.2.tgz", - "integrity": "sha512-gP1xam68Wc4ZTAee++zx6pTdDAH08rAkQrWm4B4F/y6hhmlT9Mgx2q8lTCXnrPHXsr15XjRN9+K2DLKcz44qEQ==", - "requires": { - "@types/react": "*" - } - }, "@types/stack-utils": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-1.0.1.tgz", "integrity": "sha512-l42BggppR6zLmpfU6fq9HEa2oGPEI8yrSPL3GITjfRInppYFahObbIQOQK3UGxEnyQpltZLaPe75046NOZQikw==" }, - "@types/styled-jsx": { - "version": "2.2.8", - "resolved": 
"https://registry.npmjs.org/@types/styled-jsx/-/styled-jsx-2.2.8.tgz", - "integrity": "sha512-Yjye9VwMdYeXfS71ihueWRSxrruuXTwKCbzue4+5b2rjnQ//AtyM7myZ1BEhNhBQ/nL/RE7bdToUoLln2miKvg==", - "requires": { - "@types/react": "*" - } - }, "@types/yargs": { "version": "13.0.11", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-13.0.11.tgz", @@ -20901,14 +20693,6 @@ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.10.1.tgz", "integrity": "sha512-zg7Hz2k5lI8kb7U32998pRRFin7zJlkfezGJjUc2heaD4Pw2wObakCDVzkKztTm/Ln7eiVvYsjqak0Ed4LkMDA==" }, - "axios": { - "version": "0.21.1", - "resolved": "https://registry.npmjs.org/axios/-/axios-0.21.1.tgz", - "integrity": "sha512-dKQiRHxGD9PPRIUNIWvZhPTPpl1rf/OxTYKsqKUDjBwYylTvV7SjSHJb9ratfyzM6wCdLCOYLzs73qpg5c4iGA==", - "requires": { - "follow-redirects": "^1.10.0" - } - }, "axobject-query": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-2.2.0.tgz", @@ -22736,11 +22520,6 @@ } } }, - "dayjs": { - "version": "1.10.4", - "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.10.4.tgz", - "integrity": "sha512-RI/Hh4kqRc1UKLOAf/T5zdMMX5DQIlDxwUe3wSyMMnEbGunnpENCdbUgM+dW7kXidZqCttBrmw7BhN4TMddkCw==" - }, "debug": { "version": "4.3.1", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", @@ -24259,14 +24038,6 @@ "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=" }, - "fault": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/fault/-/fault-1.0.4.tgz", - "integrity": "sha512-CJ0HCB5tL5fYTEA7ToAq5+kTwd++Borf1/bifxd9iT70QcXr4MRrO3Llf8Ifs70q+SJcGHFtnIE/Nw6giCtECA==", - "requires": { - "format": "^0.2.0" - } - }, "faye-websocket": { "version": "0.10.0", "resolved": "https://registry.npmjs.org/faye-websocket/-/faye-websocket-0.10.0.tgz", @@ -24541,11 +24312,6 @@ "mime-types": "^2.1.12" } }, - "format": { - "version": "0.2.2", - "resolved": 
"https://registry.npmjs.org/format/-/format-0.2.2.tgz", - "integrity": "sha1-1hcBB+nv3E7TDJ3DkBbflCtctYs=" - }, "forwarded": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", @@ -24946,11 +24712,6 @@ "resolved": "https://registry.npmjs.org/hex-color-regex/-/hex-color-regex-1.1.0.tgz", "integrity": "sha512-l9sfDFsuqtOqKDsQdqrMRk0U85RZc0RtOR9yPI7mRVOa4FsR/BVnZ0shmQRM96Ji99kYZP/7hn1cedc1+ApsTQ==" }, - "highlight.js": { - "version": "10.5.0", - "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.5.0.tgz", - "integrity": "sha512-xTmvd9HiIHR6L53TMC7TKolEj65zG1XU+Onr8oi86mYa+nLcIbxTTWkpW7CsEwv/vK7u1zb8alZIMLDqqN6KTw==" - }, "history": { "version": "4.10.1", "resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz", @@ -25284,9 +25045,9 @@ "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" }, "ini": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", - "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.5.tgz", + "integrity": "sha512-RZY5huIKCMRWDUqZlEi72f/lmXKMvuszcMBduliQ3nnWbx9X/ZBQO7DijMEYS9EhHBb2qacRUMtC7svLwe0lcw==" }, "inquirer": { "version": "7.0.4", @@ -27538,15 +27299,6 @@ "tslib": "^1.10.0" } }, - "lowlight": { - "version": "1.18.0", - "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-1.18.0.tgz", - "integrity": "sha512-Zlc3GqclU71HRw5fTOy00zz5EOlqAdKMYhOFIO8ay4SQEDQgFuhR8JNwDIzAGMLoqTsWxe0elUNmq5o2USRAzw==", - "requires": { - "fault": "^1.0.0", - "highlight.js": "~10.5.0" - } - }, "lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -27629,11 +27381,6 @@ "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=" }, - 
"memoize-one": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/memoize-one/-/memoize-one-5.1.1.tgz", - "integrity": "sha512-HKeeBpWvqiVJD57ZUAsJNm71eHTykffzcLZVYWiVfQeI1rJtuEaS7hQiEpWfVVk18donPwJEcFKIkCmPJNOhHA==" - }, "memory-fs": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/memory-fs/-/memory-fs-0.4.1.tgz", @@ -28186,11 +27933,6 @@ "resolved": "https://registry.npmjs.org/num2fraction/-/num2fraction-1.2.2.tgz", "integrity": "sha1-b2gragJ6Tp3fpFZM0lidHU5mnt4=" }, - "numeral": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/numeral/-/numeral-2.0.6.tgz", - "integrity": "sha1-StCAk21EPCVhrtnyGX7//iX05QY=" - }, "nwsapi": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.0.tgz", @@ -30349,15 +30091,6 @@ "prop-types": "^15.6.2" } }, - "react-window": { - "version": "1.8.6", - "resolved": "https://registry.npmjs.org/react-window/-/react-window-1.8.6.tgz", - "integrity": "sha512-8VwEEYyjz6DCnGBsd+MgkD0KJ2/OXFULyDtorIiTz+QzwoP94tBoA7CnbtyXMm+cCeAUER5KJcPtWl9cpKbOBg==", - "requires": { - "@babel/runtime": "^7.0.0", - "memoize-one": ">=3.1.1 <6" - } - }, "read-pkg": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", @@ -30841,14 +30574,6 @@ "resolved": "https://registry.npmjs.org/rgba-regex/-/rgba-regex-1.0.0.tgz", "integrity": "sha1-QzdOLiyglosO8VI0YLfXMP8i7rM=" }, - "rifm": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/rifm/-/rifm-0.7.0.tgz", - "integrity": "sha512-DSOJTWHD67860I5ojetXdEQRIBvF6YcpNe53j0vn1vp9EUb9N80EiZTxgP+FkDKorWC8PZw052kTF4C1GOivCQ==", - "requires": { - "@babel/runtime": "^7.3.1" - } - }, "rimraf": { "version": "2.6.3", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", diff --git a/dashboard/client/package.json b/dashboard/client/package.json index 535d3b48362f..3ac262ef70d5 100644 --- a/dashboard/client/package.json +++ b/dashboard/client/package.json @@ -1,36 +1,25 @@ { - "name": 
"ray-dashboard-client", - "version": "1.0.0", + "name": "client", + "version": "0.1.0", "private": true, "dependencies": { "@material-ui/core": "4.11.0", "@material-ui/icons": "^4.9.1", "@material-ui/lab": "^4.0.0-alpha.56", - "@material-ui/pickers": "^3.2.10", "@reduxjs/toolkit": "^1.3.1", "@types/classnames": "^2.2.10", "@types/jest": "25.1.4", - "@types/lodash": "^4.14.161", - "@types/lowlight": "^0.0.1", "@types/node": "13.9.5", - "@types/numeral": "^0.0.26", "@types/react": "16.9.26", "@types/react-dom": "16.9.5", "@types/react-redux": "^7.1.7", "@types/react-router-dom": "^5.1.3", - "@types/react-window": "^1.8.2", - "axios": "^0.21.1", "classnames": "^2.2.6", - "dayjs": "^1.9.4", - "lodash": "^4.17.20", - "lowlight": "^1.14.0", - "numeral": "^2.0.6", "react": "^16.13.1", "react-dom": "^16.13.1", "react-redux": "^7.2.0", "react-router-dom": "^5.1.2", "react-scripts": "^3.4.3", - "react-window": "^1.8.5", "typeface-roboto": "0.0.75", "typescript": "3.8.3", "use-debounce": "^3.4.3" @@ -51,7 +40,6 @@ "eslint": "./node_modules/.bin/eslint \"src/**\"" }, "eslintConfig": { - "ignorePatterns": ["*.svg", "*.css"], "extends": [ "plugin:import/warnings", "react-app" @@ -122,6 +110,5 @@ "last 1 firefox version", "last 1 safari version" ] - }, - "proxy": "http://localhost:8265" + } } diff --git a/dashboard/client/src/App.tsx b/dashboard/client/src/App.tsx index be2a8fc0beb6..c0bdae6a13dd 100644 --- a/dashboard/client/src/App.tsx +++ b/dashboard/client/src/App.tsx @@ -1,112 +1,21 @@ import { CssBaseline } from "@material-ui/core"; -import { ThemeProvider } from "@material-ui/core/styles"; -import React, { Suspense, useEffect, useState } from "react"; +import React from "react"; import { Provider } from "react-redux"; -import { HashRouter, Route, Switch } from "react-router-dom"; +import { BrowserRouter, Route } from "react-router-dom"; import Dashboard from "./pages/dashboard/Dashboard"; -import Loading from "./pages/exception/Loading"; -import { getNodeList } from 
"./service/node"; import { store } from "./store"; -import { darkTheme, lightTheme } from "./theme"; -import { getLocalStorage, setLocalStorage } from "./util/localData"; -// lazy loading fro prevent loading too much code at once -const Actors = React.lazy(() => import("./pages/actor")); -const CMDResult = React.lazy(() => import("./pages/cmd/CMDResult")); -const Index = React.lazy(() => import("./pages/index/Index")); -const Job = React.lazy(() => import("./pages/job")); -const JobDetail = React.lazy(() => import("./pages/job/JobDetail")); -const BasicLayout = React.lazy(() => import("./pages/layout")); -const Logs = React.lazy(() => import("./pages/log/Logs")); -const Node = React.lazy(() => import("./pages/node")); -const NodeDetail = React.lazy(() => import("./pages/node/NodeDetail")); - -// key to store theme in local storage -const RAY_DASHBOARD_THEME_KEY = "ray-dashboard-theme"; - -// a global map for relations -export const GlobalContext = React.createContext({ - nodeMap: {} as { [key: string]: string }, - ipLogMap: {} as { [key: string]: string }, - namespaceMap: {} as { [key: string]: string[] }, -}); - -export const getDefaultTheme = () => - getLocalStorage(RAY_DASHBOARD_THEME_KEY) || "light"; -export const setLocalTheme = (theme: string) => - setLocalStorage(RAY_DASHBOARD_THEME_KEY, theme); - -const App = () => { - const [theme, _setTheme] = useState(getDefaultTheme()); - const [context, setContext] = useState<{ - nodeMap: { [key: string]: string }; - ipLogMap: { [key: string]: string }; - namespaceMap: { [key: string]: string[] }; - }>({ nodeMap: {}, ipLogMap: {}, namespaceMap: {} }); - const getTheme = (name: string) => { - switch (name) { - case "dark": - return darkTheme; - case "light": - default: - return lightTheme; - } - }; - const setTheme = (name: string) => { - setLocalTheme(name); - _setTheme(name); - }; - useEffect(() => { - getNodeList().then((res) => { - if (res?.data?.data?.summary) { - const nodeMap = {} as { [key: string]: string }; - 
const ipLogMap = {} as { [key: string]: string }; - res.data.data.summary.forEach(({ hostname, raylet, ip, logUrl }) => { - nodeMap[hostname] = raylet.nodeId; - ipLogMap[ip] = logUrl; - }); - setContext({ nodeMap, ipLogMap, namespaceMap: {} }); - } - }); - }, []); - - return ( - - - - - - - - - ( - - - - - - ( - - )} - exact - path="/log/:host?/:path?" - /> - - - - - - )} - /> - - - - - - - ); -}; +class App extends React.Component { + render() { + return ( + + + + + + + ); + } +} export default App; diff --git a/dashboard/client/src/api.ts b/dashboard/client/src/api.ts index b7f4f5f41477..e2ff52464e84 100644 --- a/dashboard/client/src/api.ts +++ b/dashboard/client/src/api.ts @@ -1,4 +1,7 @@ -const base = window.location.origin; +const base = + process.env.NODE_ENV === "development" + ? "http://localhost:8265" + : window.location.origin; type APIResponse = { result: boolean; diff --git a/dashboard/client/src/components/ActorTable.tsx b/dashboard/client/src/components/ActorTable.tsx deleted file mode 100644 index b90e5cf34a68..000000000000 --- a/dashboard/client/src/components/ActorTable.tsx +++ /dev/null @@ -1,253 +0,0 @@ -import { - InputAdornment, - Table, - TableBody, - TableCell, - TableHead, - TableRow, - TextField, - TextFieldProps, -} from "@material-ui/core"; -import { orange } from "@material-ui/core/colors"; -import { SearchOutlined } from "@material-ui/icons"; -import Autocomplete from "@material-ui/lab/Autocomplete"; -import Pagination from "@material-ui/lab/Pagination"; -import React, { useContext, useState } from "react"; -import { Link } from "react-router-dom"; -import { GlobalContext } from "../App"; -import { Actor } from "../type/actor"; -import { Worker } from "../type/worker"; -import { longTextCut } from "../util/func"; -import { useFilter } from "../util/hook"; -import StateCounter from "./StatesCounter"; -import { StatusChip } from "./StatusChip"; -import RayletWorkerTable, { ExpandableTableRow } from "./WorkerTable"; - -const ActorTable = 
({ - actors = {}, - workers = [], -}: { - actors: { [actorId: string]: Actor }; - workers?: Worker[]; -}) => { - const [pageNo, setPageNo] = useState(1); - const { changeFilter, filterFunc } = useFilter(); - const [pageSize, setPageSize] = useState(10); - const { ipLogMap } = useContext(GlobalContext); - const actorList = Object.values(actors || {}) - .map((e) => ({ - ...e, - functionDesc: Object.values( - e.taskSpec?.functionDescriptor?.javaFunctionDescriptor || - e.taskSpec?.functionDescriptor?.pythonFunctionDescriptor || - {}, - ).join(" "), - })) - .filter(filterFunc); - const list = actorList.slice((pageNo - 1) * pageSize, pageNo * pageSize); - - return ( - -
- e.state)), - )} - onInputChange={(_: any, value: string) => { - changeFilter("state", value.trim()); - }} - renderInput={(params: TextFieldProps) => ( - - )} - /> - e.address?.ipAddress)), - )} - onInputChange={(_: any, value: string) => { - changeFilter("address.ipAddress", value.trim()); - }} - renderInput={(params: TextFieldProps) => ( - - )} - /> - { - changeFilter("pid", value.trim()); - }, - endAdornment: ( - - - - ), - }} - /> - { - changeFilter("functionDesc", value.trim()); - }, - endAdornment: ( - - - - ), - }} - /> - { - changeFilter("name", value.trim()); - }, - endAdornment: ( - - - - ), - }} - /> - { - changeFilter("actorId", value.trim()); - }, - endAdornment: ( - - - - ), - }} - /> - { - setPageSize(Math.min(Number(value), 500) || 10); - }, - }} - /> -
-
-
- setPageNo(num)} - count={Math.ceil(actorList.length / pageSize)} - /> -
-
- -
-
- - - - {[ - "", - "ID(Num Restarts)", - "Name", - "Task Func Desc", - "Job Id", - "Pid", - "IP", - "Port", - "State", - "Log", - ].map((col) => ( - - {col} - - ))} - - - - {list.map( - ({ - actorId, - functionDesc, - jobId, - pid, - address, - state, - name, - numRestarts, - }) => ( - - e.pid === pid && - address.ipAddress === e.coreWorkerStats[0].ipAddress, - ).length - } - expandComponent={ - - e.pid === pid && - address.ipAddress === e.coreWorkerStats[0].ipAddress, - )} - mini - /> - } - key={actorId} - > - 0 ? orange[500] : "inherit", - }} - > - {actorId}({numRestarts}) - - {name} - - {longTextCut(functionDesc, 60)} - - {jobId} - {pid} - {address?.ipAddress} - {address?.port} - - - - - {ipLogMap[address?.ipAddress] && ( - - Log - - )} - - - ), - )} - -
-
- ); -}; - -export default ActorTable; diff --git a/dashboard/client/src/components/Loading.tsx b/dashboard/client/src/components/Loading.tsx deleted file mode 100644 index 6c1cb1e8f0ea..000000000000 --- a/dashboard/client/src/components/Loading.tsx +++ /dev/null @@ -1,10 +0,0 @@ -import { Backdrop, CircularProgress } from "@material-ui/core"; -import React from "react"; - -const Loading = ({ loading }: { loading: boolean }) => ( - - - -); - -export default Loading; diff --git a/dashboard/client/src/components/LogView/LogVirtualView.tsx b/dashboard/client/src/components/LogView/LogVirtualView.tsx deleted file mode 100644 index 2046989c2702..000000000000 --- a/dashboard/client/src/components/LogView/LogVirtualView.tsx +++ /dev/null @@ -1,221 +0,0 @@ -import dayjs from "dayjs"; -import low from "lowlight"; -import React, { - CSSProperties, - MutableRefObject, - useEffect, - useRef, - useState, -} from "react"; -import { FixedSizeList as List } from "react-window"; -import "./darcula.css"; -import "./github.css"; -import "./index.css"; -import { getDefaultTheme } from "../../App"; - -const uniqueKeySelector = () => Math.random().toString(16).slice(-8); - -const timeReg = /(?:(?!0000)[0-9]{4}-(?:(?:0[1-9]|1[0-2])-(?:0[1-9]|1[0-9]|2[0-8])|(?:0[13-9]|1[0-2])-(?:29|30)|(?:0[13578]|1[02])-31)|(?:[0-9]{2}(?:0[48]|[2468][048]|[13579][26])|(?:0[48]|[2468][048]|[13579][26])00)-02-29)\s+([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]/; - -const value2react = ( - { type, tagName, properties, children, value = "" }: any, - key: string, - keywords: string = "", -) => { - switch (type) { - case "element": - return React.createElement( - tagName, - { - className: properties.className[0], - key: `${key}line${uniqueKeySelector()}`, - }, - children.map((e: any, i: number) => - value2react(e, `${key}-${i}`, keywords), - ), - ); - case "text": - if (keywords && value.includes(keywords)) { - const afterChildren = []; - const vals = value.split(keywords); - let tmp = vals.shift(); - if (!tmp) { - 
return React.createElement( - "span", - { className: "find-kws" }, - keywords, - ); - } - while (typeof tmp === "string") { - if (tmp !== "") { - afterChildren.push(tmp); - } else { - afterChildren.push( - React.createElement("span", { className: "find-kws" }, keywords), - ); - } - - tmp = vals.shift(); - if (tmp) { - afterChildren.push( - React.createElement("span", { className: "find-kws" }, keywords), - ); - } - } - return afterChildren; - } - return value; - default: - return []; - } -}; - -export type LogVirtualViewProps = { - content: string; - width?: number; - height?: number; - fontSize?: number; - theme?: "light" | "dark"; - language?: string; - focusLine?: number; - keywords?: string; - style?: { [key: string]: string | number }; - listRef?: MutableRefObject; - onScrollBottom?: (event: Event) => void; - revert?: boolean; - startTime?: string; - endTime?: string; -}; - -const LogVirtualView: React.FC = ({ - content, - width = "100%", - height, - fontSize = 12, - theme = getDefaultTheme(), - keywords = "", - language = "dos", - focusLine = 1, - style = {}, - listRef, - onScrollBottom, - revert = false, - startTime, - endTime, -}) => { - const [logs, setLogs] = useState<{ i: number; origin: string }[]>([]); - const total = logs.length; - const timmer = useRef>(); - const el = useRef(null); - const outter = useRef(null); - if (listRef) { - listRef.current = outter.current; - } - const itemRenderer = ({ - index, - style: s, - }: { - index: number; - style: CSSProperties; - }) => { - const { i, origin } = logs[revert ? logs.length - 1 - index : index]; - return ( -
- - {i + 1} - - {low - .highlight(language, origin) - .value.map((v) => value2react(v, index.toString(), keywords))} -
- ); - }; - - useEffect(() => { - const originContent = content.split("\n"); - if (timmer.current) { - clearTimeout(timmer.current); - } - timmer.current = setTimeout(() => { - setLogs( - originContent - .map((e, i) => ({ - i, - origin: e, - time: (e?.match(timeReg) || [""])[0], - })) - .filter((e) => { - let bool = e.origin.includes(keywords); - if ( - e.time && - startTime && - !dayjs(e.time).isAfter(dayjs(startTime)) - ) { - bool = false; - } - if (e.time && endTime && !dayjs(e.time).isBefore(dayjs(endTime))) { - bool = false; - } - return bool; - }) - .map((e) => ({ - ...e, - })), - ); - }, 500); - }, [content, keywords, language, startTime, endTime]); - - useEffect(() => { - if (el.current) { - el.current?.scrollTo((focusLine - 1) * (fontSize + 6)); - } - }, [focusLine, fontSize]); - - useEffect(() => { - if (outter.current) { - const scrollFunc = (event: any) => { - const { target } = event; - if ( - target && - target.scrollTop + target.clientHeight === target.scrollHeight - ) { - if (onScrollBottom) { - onScrollBottom(event); - } - } - }; - outter.current.addEventListener("scroll", scrollFunc); - return () => outter?.current?.removeEventListener("scroll", scrollFunc); - } - }, [onScrollBottom]); - - return ( - - {itemRenderer} - - ); -}; - -export default LogVirtualView; diff --git a/dashboard/client/src/components/LogView/darcula.css b/dashboard/client/src/components/LogView/darcula.css deleted file mode 100644 index 8564bf89570d..000000000000 --- a/dashboard/client/src/components/LogView/darcula.css +++ /dev/null @@ -1,59 +0,0 @@ -/* -Dracula Theme v1.2.0 -https://github.com/zenorocha/dracula-theme -Copyright 2015, All rights reserved -Code licensed under the MIT license -http://zenorocha.mit-license.org -@author Éverton Ribeiro -@author Zeno Rocha -*/ -.hljs-dark { - display: block; - overflow-x: auto; - padding: 0.5em; - color: #f8f8f2; -} -.hljs-dark .hljs-number, -.hljs-dark .hljs-keyword, -.hljs-dark .hljs-selector-tag, -.hljs-dark .hljs-literal, 
-.hljs-dark .hljs-section, -.hljs-dark .hljs-link { - color: #8be9fd; -} -.hljs-dark .hljs-function .hljs-keyword { - color: #ff79c6; -} -.hljs-dark .hljs-string, -.hljs-dark .hljs-title, -.hljs-dark .hljs-name, -.hljs-dark .hljs-type, -.hljs-dark .hljs-attribute, -.hljs-dark .hljs-symbol, -.hljs-dark .hljs-bullet, -.hljs-dark .hljs-addition, -.hljs-dark .hljs-variable, -.hljs-dark .hljs-template-tag, -.hljs-dark .hljs-template-variable { - color: #f1fa8c; -} -.hljs-dark .hljs-comment, -.hljs-dark .hljs-quote, -.hljs-dark .hljs-deletion, -.hljs-dark .hljs-meta { - color: #6272a4; -} -.hljs-dark .hljs-keyword, -.hljs-dark .hljs-selector-tag, -.hljs-dark .hljs-literal, -.hljs-dark .hljs-title, -.hljs-dark .hljs-section, -.hljs-dark .hljs-doctag, -.hljs-dark .hljs-type, -.hljs-dark .hljs-name, -.hljs-dark .hljs-strong { - font-weight: bold; -} -.hljs-dark .hljs-emphasis { - font-style: italic; -} diff --git a/dashboard/client/src/components/LogView/github.css b/dashboard/client/src/components/LogView/github.css deleted file mode 100644 index ca16d3f7393e..000000000000 --- a/dashboard/client/src/components/LogView/github.css +++ /dev/null @@ -1,96 +0,0 @@ -/* -github.com style (c) Vasily Polovnyov -*/ - -.hljs-light { - display: block; - overflow-x: auto; - padding: 0.5em; - color: #333; -} - -.hljs-light .hljs-comment, -.hljs-light .hljs-quote { - color: #998; - font-style: italic; -} - -.hljs-light .hljs-keyword, -.hljs-light .hljs-selector-tag, -.hljs-light .hljs-subst { - color: #333; - font-weight: bold; -} - -.hljs-light .hljs-number, -.hljs-light .hljs-literal, -.hljs-light .hljs-variable, -.hljs-light .hljs-template-variable, -.hljs-light .hljs-tag .hljs-attr { - color: #008080; -} - -.hljs-light .hljs-string, -.hljs-light .hljs-doctag { - color: #d14; -} - -.hljs-light .hljs-title, -.hljs-light .hljs-section, -.hljs-light .hljs-selector-id { - color: #900; - font-weight: bold; -} - -.hljs-light .hljs-subst { - font-weight: normal; -} - -.hljs-light .hljs-type, 
-.hljs-light .hljs-class .hljs-title { - color: #458; - font-weight: bold; -} - -.hljs-light .hljs-tag, -.hljs-light .hljs-name, -.hljs-light .hljs-attribute { - color: #000080; - font-weight: normal; -} - -.hljs-light .hljs-regexp, -.hljs-light .hljs-link { - color: #009926; -} - -.hljs-light .hljs-symbol, -.hljs-light .hljs-bullet { - color: #990073; -} - -.hljs-light .hljs-built_in, -.hljs-light .hljs-builtin-name { - color: #0086b3; -} - -.hljs-light .hljs-meta { - color: #999; - font-weight: bold; -} - -.hljs-light .hljs-deletion { - background: #fdd; -} - -.hljs-light .hljs-addition { - background: #dfd; -} - -.hljs-light .hljs-emphasis { - font-style: italic; -} - -.hljs-light .hljs-strong { - font-weight: bold; -} diff --git a/dashboard/client/src/components/LogView/index.css b/dashboard/client/src/components/LogView/index.css deleted file mode 100644 index 32e5f884f2bc..000000000000 --- a/dashboard/client/src/components/LogView/index.css +++ /dev/null @@ -1,3 +0,0 @@ -span.find-kws { - background-color: #ffd800; -} diff --git a/dashboard/client/src/components/PercentageBar.tsx b/dashboard/client/src/components/PercentageBar.tsx deleted file mode 100644 index 6b2cc48ade68..000000000000 --- a/dashboard/client/src/components/PercentageBar.tsx +++ /dev/null @@ -1,57 +0,0 @@ -import { makeStyles } from "@material-ui/core"; -import React, { PropsWithChildren } from "react"; - -const useStyle = makeStyles((theme) => ({ - container: { - background: "linear-gradient(45deg, #21CBF3ee 30%, #2196F3ee 90%)", - border: `1px solid #ffffffbb`, - padding: "0 12px", - height: 18, - lineHeight: "18px", - position: "relative", - boxSizing: "content-box", - borderRadius: 4, - }, - displayBar: { - background: theme.palette.background.paper, - position: "absolute", - right: 0, - height: 18, - transition: "0.5s width", - borderRadius: 2, - borderTopLeftRadius: 0, - borderBottomLeftRadius: 0, - border: "2px solid transparent", - boxSizing: "border-box", - }, - text: { - fontSize: 
12, - zIndex: 2, - position: "relative", - color: theme.palette.text.primary, - width: "100%", - textAlign: "center", - }, -})); - -const PercentageBar = ( - props: PropsWithChildren<{ num: number; total: number }>, -) => { - const { num, total } = props; - const classes = useStyle(); - const per = Math.round((num / total) * 100); - - return ( -
-
-
{props.children}
-
- ); -}; - -export default PercentageBar; diff --git a/dashboard/client/src/components/SearchComponent.tsx b/dashboard/client/src/components/SearchComponent.tsx deleted file mode 100644 index 02170b13c31f..000000000000 --- a/dashboard/client/src/components/SearchComponent.tsx +++ /dev/null @@ -1,87 +0,0 @@ -import { - InputAdornment, - makeStyles, - MenuItem, - TextField, -} from "@material-ui/core"; -import { SearchOutlined } from "@material-ui/icons"; -import React from "react"; - -const useStyles = makeStyles((theme) => ({ - search: { - margin: theme.spacing(1), - marginTop: 0, - }, -})); - -export const SearchInput = ({ - label, - onChange, - defaultValue, -}: { - label: string; - defaultValue?: string; - onChange?: (value: string) => void; -}) => { - const classes = useStyles(); - - return ( - { - if (onChange) { - onChange(value); - } - }, - defaultValue, - endAdornment: ( - - - - ), - }} - /> - ); -}; - -export const SearchSelect = ({ - label, - onChange, - options, -}: { - label: string; - onChange?: (value: string) => void; - options: (string | [string, string])[]; -}) => { - const classes = useStyles(); - return ( - { - if (onChange) { - onChange(value as string); - } - }, - style: { - width: 100, - }, - }} - > - All - {options.map((e) => - typeof e === "string" ? 
( - {e} - ) : ( - {e[1]} - ), - )} - - ); -}; diff --git a/dashboard/client/src/components/SpeedTools.tsx b/dashboard/client/src/components/SpeedTools.tsx deleted file mode 100644 index 7094a41176a7..000000000000 --- a/dashboard/client/src/components/SpeedTools.tsx +++ /dev/null @@ -1,156 +0,0 @@ -import { - Grow, - makeStyles, - Paper, - Tab, - Tabs, - TextField, -} from "@material-ui/core"; -import { red } from "@material-ui/core/colors"; -import { Build, Close } from "@material-ui/icons"; -import React, { useState } from "react"; -import { StatusChip } from "./StatusChip"; - -const chunkArray = (myArray: string[], chunk_size: number) => { - const results = []; - - while (myArray.length) { - results.push(myArray.splice(0, chunk_size)); - } - - return results; -}; - -const revertBit = (str: string) => { - return chunkArray(str.split(""), 2) - .reverse() - .map((e) => e.join("")) - .join(""); -}; - -const detectFlag = (str: string, offset: number) => { - const flag = parseInt(str, 16); - const mask = 1 << offset; - - return Number(!!(flag & mask)); -}; - -const useStyle = makeStyles((theme) => ({ - toolContainer: { - background: theme.palette.primary.main, - width: 48, - height: 48, - borderRadius: 48, - position: "fixed", - bottom: 100, - left: 50, - color: theme.palette.primary.contrastText, - }, - icon: { - position: "absolute", - left: 12, - cursor: "pointer", - top: 12, - }, - popover: { - position: "absolute", - left: 50, - bottom: 48, - width: 500, - height: 300, - padding: 6, - border: "1px solid", - borderColor: theme.palette.text.disabled, - }, - close: { - float: "right", - color: theme.palette.error.main, - cursor: "pointer", - }, -})); - -const ObjectIdReader = () => { - const [id, setId] = useState(""); - const tagList = [ - ["Create From Task", 15, 1], - ["Put Object", 14, 0], - ["Return Object", 14, 1], - ] as [string, number, number][]; - - return ( -
- { - setId(value); - }, - }} - /> -
- {id.length === 40 ? ( -
- Job ID: {id.slice(24, 28)}
- Actor ID: {id.slice(16, 28)}
- Task ID: {id.slice(0, 28)}
- Index: {parseInt(revertBit(id.slice(32)), 16)}
- Flag: {revertBit(id.slice(28, 32))} -
-
- {tagList - .filter( - ([a, b, c]) => detectFlag(revertBit(id.slice(28, 32)), b) === c, - ) - .map(([name]) => ( - - ))} -
- ) : ( - - Object ID should be 40 letters long - - )} -
-
- ); -}; - -const Tools = () => { - const [sel, setSel] = useState("oid_converter"); - const toolMap = { - oid_converter: , - } as { [key: string]: JSX.Element }; - - return ( -
- setSel(val)}> - Object ID Reader} - /> - - {toolMap[sel]} -
- ); -}; - -const SpeedTools = () => { - const [show, setShow] = useState(false); - const classes = useStyle(); - - return ( - - setShow(!show)} /> - - - setShow(false)} /> - - - - - ); -}; - -export default SpeedTools; diff --git a/dashboard/client/src/components/StatesCounter.tsx b/dashboard/client/src/components/StatesCounter.tsx deleted file mode 100644 index b5fc987e5f6c..000000000000 --- a/dashboard/client/src/components/StatesCounter.tsx +++ /dev/null @@ -1,31 +0,0 @@ -import { Grid } from "@material-ui/core"; -import React from "react"; -import { StatusChip } from "./StatusChip"; - -const StateCounter = ({ - type, - list, -}: { - type: string; - list: { state: string }[]; -}) => { - const stateMap = {} as { [state: string]: number }; - list.forEach(({ state }) => { - stateMap[state] = stateMap[state] + 1 || 1; - }); - - return ( - - - - - {Object.entries(stateMap).map(([s, num]) => ( - - - - ))} - - ); -}; - -export default StateCounter; diff --git a/dashboard/client/src/components/StatusChip.tsx b/dashboard/client/src/components/StatusChip.tsx deleted file mode 100644 index dc9fb11fa705..000000000000 --- a/dashboard/client/src/components/StatusChip.tsx +++ /dev/null @@ -1,90 +0,0 @@ -import { Color } from "@material-ui/core"; -import { - blue, - blueGrey, - cyan, - green, - grey, - lightBlue, - red, -} from "@material-ui/core/colors"; -import { CSSProperties } from "@material-ui/core/styles/withStyles"; -import React, { ReactNode } from "react"; -import { ActorEnum } from "../type/actor"; - -const colorMap = { - node: { - ALIVE: green, - DEAD: red, - }, - actor: { - [ActorEnum.ALIVE]: green, - [ActorEnum.DEAD]: red, - [ActorEnum.PENDING]: blue, - [ActorEnum.RECONSTRUCTING]: lightBlue, - }, - job: { - INIT: grey, - SUBMITTED: blue, - DISPATCHED: lightBlue, - RUNNING: green, - COMPLETED: cyan, - FINISHED: cyan, - FAILED: red, - }, -} as { - [key: string]: { - [key: string]: Color; - }; -}; - -const typeMap = { - deps: blue, - INFO: cyan, - ERROR: red, -} as 
{ - [key: string]: Color; -}; - -export const StatusChip = ({ - type, - status, - suffix, -}: { - type: string; - status: string | ActorEnum | ReactNode; - suffix?: string; -}) => { - const style = { - padding: "2px 8px", - border: "solid 1px", - borderRadius: 4, - fontSize: 12, - margin: 2, - } as CSSProperties; - - let color = blueGrey as Color; - - if (typeMap[type]) { - color = typeMap[type]; - } else if ( - typeof status === "string" && - colorMap[type] && - colorMap[type][status] - ) { - color = colorMap[type][status]; - } - - style.color = color[500]; - style.borderColor = color[500]; - if (color !== blueGrey) { - style.backgroundColor = `${color[500]}20`; - } - - return ( - - {status} - {suffix} - - ); -}; diff --git a/dashboard/client/src/components/TitleCard.tsx b/dashboard/client/src/components/TitleCard.tsx deleted file mode 100644 index db088f775e60..000000000000 --- a/dashboard/client/src/components/TitleCard.tsx +++ /dev/null @@ -1,34 +0,0 @@ -import { makeStyles, Paper } from "@material-ui/core"; -import React, { PropsWithChildren, ReactNode } from "react"; - -const useStyles = makeStyles((theme) => ({ - card: { - padding: theme.spacing(2), - paddingTop: theme.spacing(1.5), - margin: [theme.spacing(2), theme.spacing(1)].map((e) => `${e}px`).join(" "), - }, - title: { - fontSize: theme.typography.fontSize + 2, - fontWeight: 500, - color: theme.palette.text.secondary, - marginBottom: theme.spacing(1), - }, - body: { - padding: theme.spacing(0.5), - }, -})); - -const TitleCard = ({ - title, - children, -}: PropsWithChildren<{ title: ReactNode | string }>) => { - const classes = useStyles(); - return ( - -
{title}
-
{children}
-
- ); -}; - -export default TitleCard; diff --git a/dashboard/client/src/components/WorkerTable.tsx b/dashboard/client/src/components/WorkerTable.tsx deleted file mode 100644 index aa6bba57b710..000000000000 --- a/dashboard/client/src/components/WorkerTable.tsx +++ /dev/null @@ -1,299 +0,0 @@ -import { - Button, - Grid, - IconButton, - Table, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, -} from "@material-ui/core"; -import { KeyboardArrowDown, KeyboardArrowRight } from "@material-ui/icons"; -import dayjs from "dayjs"; -import React, { - PropsWithChildren, - ReactNode, - useContext, - useEffect, - useState, -} from "react"; -import { Link } from "react-router-dom"; -import { GlobalContext } from "../App"; -import { Actor } from "../type/actor"; -import { CoreWorkerStats, Worker } from "../type/worker"; -import { memoryConverter } from "../util/converter"; -import { longTextCut } from "../util/func"; - -import { useFilter } from "../util/hook"; -import ActorTable from "./ActorTable"; -import PercentageBar from "./PercentageBar"; -import { SearchInput } from "./SearchComponent"; - -export const ExpandableTableRow = ({ - children, - expandComponent, - length, - stateKey = "", - ...otherProps -}: PropsWithChildren<{ - expandComponent: ReactNode; - length: number; - stateKey?: string; -}>) => { - const [isExpanded, setIsExpanded] = React.useState(false); - - useEffect(() => { - if (stateKey.startsWith("ON")) { - setIsExpanded(true); - } else if (stateKey.startsWith("OFF")) { - setIsExpanded(false); - } - }, [stateKey]); - - if (length < 1) { - return ( - - - {children} - - ); - } - - return ( - - - - setIsExpanded(!isExpanded)} - > - {length} - {isExpanded ? 
: } - - - {children} - - {isExpanded && ( - - {expandComponent} - - )} - - ); -}; - -const WorkerDetailTable = ({ - actorMap, - coreWorkerStats, -}: { - actorMap: { [actorId: string]: Actor }; - coreWorkerStats: CoreWorkerStats[]; -}) => { - const actors = {} as { [actorId: string]: Actor }; - (coreWorkerStats || []) - .filter((e) => actorMap[e.actorId]) - .forEach((e) => (actors[e.actorId] = actorMap[e.actorId])); - - if (!Object.values(actors).length) { - return

The Worker Haven't Had Related Actor Yet.

; - } - - return ( - - - - ); -}; - -const RayletWorkerTable = ({ - workers = [], - actorMap, - mini, -}: { - workers: Worker[]; - actorMap: { [actorId: string]: Actor }; - mini?: boolean; -}) => { - const { changeFilter, filterFunc } = useFilter(); - const [key, setKey] = useState(""); - const { nodeMap, ipLogMap } = useContext(GlobalContext); - const open = () => setKey(`ON${Math.random()}`); - const close = () => setKey(`OFF${Math.random()}`); - - return ( - - {!mini && ( -
- changeFilter("pid", value)} - /> - - -
- )}{" "} - - - - {[ - "", - "Pid", - "CPU", - "CPU Times", - "Memory", - "CMD Line", - "Create Time", - "Log", - "Ops", - "IP/Hostname", - ].map((col) => ( - - {col} - - ))} - - - - {workers - .filter(filterFunc) - .sort((aWorker, bWorker) => { - const a = - (aWorker.coreWorkerStats || []).filter( - (e) => actorMap[e.actorId], - ).length || 0; - const b = - (bWorker.coreWorkerStats || []).filter( - (e) => actorMap[e.actorId], - ).length || 0; - return b - a; - }) - .map( - ({ - pid, - cpuPercent, - cpuTimes, - memoryInfo, - cmdline, - createTime, - coreWorkerStats = [], - language, - ip, - hostname, - }) => ( - - } - length={ - (coreWorkerStats || []).filter((e) => actorMap[e.actorId]) - .length - } - key={pid} - stateKey={key} - > - {pid} - - - {cpuPercent}% - - - -
- {Object.entries(cpuTimes || {}).map(([key, val]) => ( -
- {key}:{val} -
- ))} -
-
- -
- {Object.entries(memoryInfo || {}).map(([key, val]) => ( -
- {key}:{memoryConverter(val)} -
- ))} -
-
- - {cmdline && longTextCut(cmdline.filter((e) => e).join(" "))} - - - {dayjs(createTime * 1000).format("YYYY/MM/DD HH:mm:ss")} - - - - {ipLogMap[ip] && ( - - - Log - - - )} - - - - {language === "JAVA" && ( -
- {" "} - - -
- )} -
- - {ip} -
- {nodeMap[hostname] ? ( - - {hostname} - - ) : ( - hostname - )} -
-
- ), - )} -
-
-
- ); -}; - -export default RayletWorkerTable; diff --git a/dashboard/client/src/logo.svg b/dashboard/client/src/logo.svg deleted file mode 100644 index 70be9ee548c6..000000000000 --- a/dashboard/client/src/logo.svg +++ /dev/null @@ -1,34 +0,0 @@ - - - - -Ray Logo - - - - - - - - - - diff --git a/dashboard/client/src/pages/actor/index.tsx b/dashboard/client/src/pages/actor/index.tsx deleted file mode 100644 index cbcd264e26af..000000000000 --- a/dashboard/client/src/pages/actor/index.tsx +++ /dev/null @@ -1,36 +0,0 @@ -import { makeStyles } from "@material-ui/core"; -import React, { useEffect, useState } from "react"; -import ActorTable from "../../components/ActorTable"; -import TitleCard from "../../components/TitleCard"; -import { getActors } from "../../service/actor"; -import { Actor } from "../../type/actor"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - }, -})); - -const Actors = () => { - const classes = useStyles(); - const [actors, setActors] = useState<{ [actorId: string]: Actor }>({}); - - useEffect(() => { - getActors().then((res) => { - if (res?.data?.data?.actors) { - setActors(res.data.data.actors); - } - }); - }, []); - - return ( -
- - - -
- ); -}; - -export default Actors; diff --git a/dashboard/client/src/pages/cmd/CMDResult.tsx b/dashboard/client/src/pages/cmd/CMDResult.tsx deleted file mode 100644 index ed87c10d8e7c..000000000000 --- a/dashboard/client/src/pages/cmd/CMDResult.tsx +++ /dev/null @@ -1,137 +0,0 @@ -import { - Button, - Grid, - makeStyles, - MenuItem, - Paper, - Select, -} from "@material-ui/core"; -import React, { useCallback, useEffect, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import LogVirtualView from "../../components/LogView/LogVirtualView"; -import TitleCard from "../../components/TitleCard"; -import { getJmap, getJstack, getJstat } from "../../service/util"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(4), - width: "100%", - }, - table: { - marginTop: theme.spacing(4), - padding: theme.spacing(2), - }, - pageMeta: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - }, - search: { - margin: theme.spacing(1), - }, -})); - -const CMDResult = ( - props: RouteComponentProps<{ cmd: string; ip: string; pid: string }>, -) => { - const classes = useStyles(); - const { - match: { params }, - } = props; - const { cmd, ip, pid } = params; - const [result, setResult] = useState(); - const [option, setOption] = useState("gcutil"); - const executeJstat = useCallback( - () => - getJstat(ip, pid, option) - .then((rsp) => { - if (rsp.data.result) { - setResult(rsp.data.data.output); - } else { - setResult(rsp.data.msg); - } - }) - .catch((err) => setResult(err.toString())), - [ip, pid, option], - ); - - useEffect(() => { - switch (cmd) { - case "jstack": - getJstack(ip, pid) - .then((rsp) => { - if (rsp.data.result) { - setResult(rsp.data.data.output); - } else { - setResult(rsp.data.msg); - } - }) - .catch((err) => setResult(err.toString())); - break; - case "jmap": - getJmap(ip, pid) - .then((rsp) => { - if (rsp.data.result) { - setResult(rsp.data.data.output); - } else { - setResult(rsp.data.msg); - } 
- }) - .catch((err) => setResult(err.toString())); - break; - case "jstat": - executeJstat(); - break; - default: - setResult(`Command ${cmd} is not supported.`); - break; - } - }, [cmd, executeJstat, ip, pid]); - - return ( -
- - {cmd === "jstat" && ( - - - - - - - - - - - )} - - - - -
- ); -}; - -export default CMDResult; diff --git a/dashboard/client/src/pages/dashboard/Dashboard.tsx b/dashboard/client/src/pages/dashboard/Dashboard.tsx index 07f266961451..0ffbce7f5d5f 100644 --- a/dashboard/client/src/pages/dashboard/Dashboard.tsx +++ b/dashboard/client/src/pages/dashboard/Dashboard.tsx @@ -1,5 +1,4 @@ import { - Button, createStyles, makeStyles, Tab, @@ -9,7 +8,6 @@ import { } from "@material-ui/core"; import React, { useCallback, useEffect, useRef } from "react"; import { useDispatch, useSelector } from "react-redux"; -import { useHistory } from "react-router-dom"; import { getActorGroups, getNodeInfo, getTuneAvailability } from "../../api"; import { StoreState } from "../../store"; import LastUpdated from "./LastUpdated"; @@ -35,7 +33,6 @@ const useDashboardStyles = makeStyles((theme: Theme) => "& > :not(:first-child)": { marginTop: theme.spacing(4), }, - position: "relative", }, tabs: { borderBottomColor: theme.palette.divider, @@ -62,7 +59,6 @@ const Dashboard: React.FC = () => { const tuneAvailability = useSelector(tuneAvailabilitySelector); const tab = useSelector(tabSelector); const classes = useDashboardStyles(); - const history = useHistory(); // Polling Function const refreshInfo = useCallback(async () => { @@ -107,15 +103,6 @@ const Dashboard: React.FC = () => { return (
Ray Dashboard - { - return ( -
-
- - - - 404 NOT FOUND -

- We can't provide the page you wanted yet, better try with another path - next time. -

-
-
- ); -}; - -export default Error404; diff --git a/dashboard/client/src/pages/exception/Loading.tsx b/dashboard/client/src/pages/exception/Loading.tsx deleted file mode 100644 index 24140c4dc0de..000000000000 --- a/dashboard/client/src/pages/exception/Loading.tsx +++ /dev/null @@ -1,21 +0,0 @@ -import React from "react"; -import Logo from "../../logo.svg"; - -export default () => { - return ( -
-
- Loading -
- Loading... -
-
- ); -}; diff --git a/dashboard/client/src/pages/index/Index.tsx b/dashboard/client/src/pages/index/Index.tsx deleted file mode 100644 index 9612164499f4..000000000000 --- a/dashboard/client/src/pages/index/Index.tsx +++ /dev/null @@ -1,110 +0,0 @@ -import { - makeStyles, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, -} from "@material-ui/core"; -import React, { useEffect, useState } from "react"; -import { version } from "../../../package.json"; -import TitleCard from "../../components/TitleCard"; -import { getRayConfig } from "../../service/cluster"; -import { getNodeList } from "../../service/node"; -import { RayConfig } from "../../type/config"; -import { NodeDetail } from "../../type/node"; -import { memoryConverter } from "../../util/converter"; - -const useStyle = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - }, - label: { - fontWeight: "bold", - }, -})); - -const getVal = (key: string, value: any) => { - if (key === "containerMemory") { - return memoryConverter(value * 1024 * 1024); - } - return JSON.stringify(value); -}; - -const useIndex = () => { - const [rayConfig, setConfig] = useState(); - const [nodes, setNodes] = useState([]); - useEffect(() => { - getRayConfig().then((res) => { - if (res?.data?.data?.config) { - setConfig(res.data.data.config); - } - }); - }, []); - useEffect(() => { - getNodeList().then((res) => { - if (res?.data?.data?.summary) { - setNodes(res.data.data.summary); - } - }); - }, []); - - return { rayConfig, nodes }; -}; - -const Index = () => { - const { rayConfig } = useIndex(); - const classes = useStyle(); - - return ( -
- -

Dashboard Frontend Version: {version}

- {rayConfig?.imageUrl && ( -

- Image Url:{" "} - - {rayConfig.imageUrl} - -

- )} - {rayConfig?.sourceCodeLink && ( -

- Source Code:{" "} - - {rayConfig.sourceCodeLink} - -

- )} -
- {rayConfig && ( - - - - Key - Value - - - {Object.entries(rayConfig).map(([key, value]) => ( - - {key} - {getVal(key, value)} - - ))} - - - - )} -
- ); -}; - -export default Index; diff --git a/dashboard/client/src/pages/job/JobDetail.tsx b/dashboard/client/src/pages/job/JobDetail.tsx deleted file mode 100644 index b720b9c057de..000000000000 --- a/dashboard/client/src/pages/job/JobDetail.tsx +++ /dev/null @@ -1,246 +0,0 @@ -import { - Grid, - makeStyles, - Switch, - Tab, - Table, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, - Tabs, -} from "@material-ui/core"; -import React from "react"; -import { Link, RouteComponentProps } from "react-router-dom"; -import ActorTable from "../../components/ActorTable"; -import Loading from "../../components/Loading"; -import { StatusChip } from "../../components/StatusChip"; -import TitleCard from "../../components/TitleCard"; -import RayletWorkerTable from "../../components/WorkerTable"; -import { longTextCut } from "../../util/func"; -import { useJobDetail } from "./hook/useJobDetail"; - -const useStyle = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - }, - paper: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - marginBottom: theme.spacing(2), - }, - label: { - fontWeight: "bold", - }, - pageMeta: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - }, - tab: { - marginBottom: theme.spacing(2), - }, - dependenciesChip: { - margin: theme.spacing(0.5), - wordBreak: "break-all", - }, - alert: { - color: theme.palette.error.main, - }, -})); - -const JobDetailPage = (props: RouteComponentProps<{ id: string }>) => { - const classes = useStyle(); - const { - actorMap, - jobInfo, - job, - msg, - selectedTab, - handleChange, - handleSwitchChange, - params, - refreshing, - ipLogMap, - } = useJobDetail(props); - - if (!job || !jobInfo) { - return ( -
- - - -
- Auto Refresh: - -
- Request Status: {msg}
-
-
- ); - } - - return ( -
- - -
- Auto Refresh: - -
- Request Status: {msg}
-
- - - - - - - - {selectedTab === "info" && ( - - - Driver IP:{" "} - {jobInfo.driverIpAddress} - - {ipLogMap[jobInfo.driverIpAddress] && ( - - Driver Log:{" "} - - Log - - - )} - - Driver Pid:{" "} - {jobInfo.driverPid} - - {jobInfo.eventUrl && ( - - Event Link:{" "} - - Event Log - - - )} - {jobInfo.failErrorMessage && ( - - Fail Error:{" "} - - {jobInfo.failErrorMessage} - - - )} - - )} - {jobInfo?.dependencies && selectedTab === "dep" && ( -
- {jobInfo?.dependencies?.python && ( - -
- {jobInfo.dependencies.python.map((e) => ( - - ))} -
-
- )} - {jobInfo?.dependencies?.java && ( - - - - - - {["Name", "Version", "URL"].map((col) => ( - - {col} - - ))} - - - - {jobInfo.dependencies.java.map( - ({ name, version, url }) => ( - - {name} - {version} - - - {url} - - - - ), - )} - -
-
-
- )} -
- )} - {selectedTab === "worker" && ( -
- - - -
- )} - {selectedTab === "actor" && ( -
- - - -
- )} -
-
- ); -}; - -export default JobDetailPage; diff --git a/dashboard/client/src/pages/job/hook/useJobDetail.ts b/dashboard/client/src/pages/job/hook/useJobDetail.ts deleted file mode 100644 index 695fca760931..000000000000 --- a/dashboard/client/src/pages/job/hook/useJobDetail.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { useCallback, useContext, useEffect, useRef, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import { GlobalContext } from "../../../App"; -import { getJobDetail } from "../../../service/job"; -import { JobDetail } from "../../../type/job"; - -export const useJobDetail = (props: RouteComponentProps<{ id: string }>) => { - const { - match: { params }, - } = props; - const [job, setJob] = useState(); - const [msg, setMsg] = useState("Loading the job detail"); - const [refreshing, setRefresh] = useState(true); - const [selectedTab, setTab] = useState("info"); - const { ipLogMap } = useContext(GlobalContext); - const tot = useRef(); - const handleChange = (event: React.ChangeEvent<{}>, newValue: string) => { - setTab(newValue); - }; - const handleSwitchChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - const getJob = useCallback(async () => { - if (!refreshing) { - return; - } - const rsp = await getJobDetail(params.id); - - if (rsp.data?.data?.detail) { - setJob(rsp.data.data.detail); - } - - if (rsp.data?.msg) { - setMsg(rsp.data.msg || ""); - } - - if (rsp.data.result === false) { - setMsg("Job Query Error Please Check JobId"); - setJob(undefined); - setRefresh(false); - } - - tot.current = setTimeout(getJob, 4000); - }, [refreshing, params.id]); - - useEffect(() => { - if (tot.current) { - clearTimeout(tot.current); - } - getJob(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getJob]); - - const { jobInfo } = job || {}; - const actorMap = job?.jobActors; - - return { - actorMap, - jobInfo, - job, - msg, - selectedTab, - handleChange, - 
handleSwitchChange, - params, - refreshing, - ipLogMap, - }; -}; diff --git a/dashboard/client/src/pages/job/hook/useJobList.ts b/dashboard/client/src/pages/job/hook/useJobList.ts deleted file mode 100644 index 04f97532f75c..000000000000 --- a/dashboard/client/src/pages/job/hook/useJobList.ts +++ /dev/null @@ -1,68 +0,0 @@ -import { useCallback, useEffect, useRef, useState } from "react"; -import { getJobList } from "../../../service/job"; -import { Job } from "../../../type/job"; - -export const useJobList = () => { - const [jobList, setList] = useState([]); - const [page, setPage] = useState({ pageSize: 10, pageNo: 1 }); - const [msg, setMsg] = useState("Loading the job list..."); - const [isRefreshing, setRefresh] = useState(true); - const [filter, setFilter] = useState< - { - key: "jobId" | "name" | "language" | "state" | "namespaceId"; - val: string; - }[] - >([]); - const refreshRef = useRef(isRefreshing); - const tot = useRef(); - const changeFilter = ( - key: "jobId" | "name" | "language" | "state" | "namespaceId", - val: string, - ) => { - const f = filter.find((e) => e.key === key); - if (f) { - f.val = val; - } else { - filter.push({ key, val }); - } - setFilter([...filter]); - }; - const onSwitchChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - refreshRef.current = isRefreshing; - const getJob = useCallback(async () => { - if (!refreshRef.current) { - return; - } - const rsp = await getJobList(); - - if (rsp?.data?.data?.summary) { - setList(rsp.data.data.summary.sort((a, b) => b.timestamp - a.timestamp)); - setMsg(rsp.data.msg || ""); - } - - tot.current = setTimeout(getJob, 4000); - }, []); - - useEffect(() => { - getJob(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getJob]); - return { - jobList: jobList.filter((node) => - filter.every((f) => node[f.key] && node[f.key].includes(f.val)), - ), - msg, - isRefreshing, - onSwitchChange, - changeFilter, - page, - originalJobs: 
jobList, - setPage: (key: string, val: number) => setPage({ ...page, [key]: val }), - }; -}; diff --git a/dashboard/client/src/pages/job/index.tsx b/dashboard/client/src/pages/job/index.tsx deleted file mode 100644 index b4984c129d3f..000000000000 --- a/dashboard/client/src/pages/job/index.tsx +++ /dev/null @@ -1,126 +0,0 @@ -import { - Switch, - Table, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, -} from "@material-ui/core"; -import { makeStyles } from "@material-ui/core/styles"; -import Pagination from "@material-ui/lab/Pagination"; -import dayjs from "dayjs"; -import React from "react"; -import { Link } from "react-router-dom"; -import Loading from "../../components/Loading"; -import { SearchInput, SearchSelect } from "../../components/SearchComponent"; -import TitleCard from "../../components/TitleCard"; -import { useJobList } from "./hook/useJobList"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - }, -})); - -const columns = ["ID", "DriverIpAddress", "DriverPid", "IsDead", "Timestamp"]; - -const JobList = () => { - const classes = useStyles(); - const { - msg, - isRefreshing, - onSwitchChange, - jobList, - changeFilter, - page, - setPage, - } = useJobList(); - - return ( -
- - - Auto Refresh: - -
- Request Status: {msg} -
- - - changeFilter("jobId", value)} - /> - changeFilter("language", value)} - options={["JAVA", "PYTHON"]} - /> - - setPage("pageSize", Math.min(Number(value), 500) || 10) - } - /> -
- setPage("pageNo", pageNo)} - /> -
- - - - {columns.map((col) => ( - - {col} - - ))} - - - - {jobList - .slice( - (page.pageNo - 1) * page.pageSize, - page.pageNo * page.pageSize, - ) - .map( - ({ - jobId = "", - driverIpAddress, - isDead, - driverPid, - timestamp, - }) => ( - - - {jobId} - - {driverIpAddress} - {driverPid} - - {isDead ? "true" : "false"} - - - {dayjs(timestamp * 1000).format("YYYY/MM/DD HH:mm:ss")} - - - ), - )} - -
-
-
-
- ); -}; - -export default JobList; diff --git a/dashboard/client/src/pages/layout/index.tsx b/dashboard/client/src/pages/layout/index.tsx deleted file mode 100644 index bcaffafce6ec..000000000000 --- a/dashboard/client/src/pages/layout/index.tsx +++ /dev/null @@ -1,157 +0,0 @@ -import { IconButton, Tooltip } from "@material-ui/core"; -import Drawer from "@material-ui/core/Drawer"; -import List from "@material-ui/core/List"; -import ListItem from "@material-ui/core/ListItem"; -import ListItemText from "@material-ui/core/ListItemText"; -import { makeStyles } from "@material-ui/core/styles"; -import Typography from "@material-ui/core/Typography"; -import { NightsStay, VerticalAlignTop, WbSunny } from "@material-ui/icons"; -import classnames from "classnames"; -import React, { PropsWithChildren } from "react"; -import { RouteComponentProps } from "react-router-dom"; - -import SpeedTools from "../../components/SpeedTools"; -import Logo from "../../logo.svg"; - -const drawerWidth = 200; - -const useStyles = makeStyles((theme) => ({ - root: { - display: "flex", - "& a": { - color: theme.palette.primary.main, - }, - }, - drawer: { - width: drawerWidth, - flexShrink: 0, - background: theme.palette.background.paper, - }, - drawerPaper: { - width: drawerWidth, - border: "none", - background: theme.palette.background.paper, - boxShadow: theme.shadows[1], - }, - title: { - padding: theme.spacing(2), - textAlign: "center", - lineHeight: "36px", - }, - divider: { - background: "rgba(255, 255, 255, .12)", - }, - menuItem: { - cursor: "pointer", - "&:hover": { - background: theme.palette.primary.main, - }, - }, - selected: { - background: `linear-gradient(45deg, ${theme.palette.primary.main} 30%, ${theme.palette.secondary.main} 90%)`, - }, - child: { - flex: 1, - }, -})); - -const BasicLayout = ( - props: PropsWithChildren< - { setTheme: (theme: string) => void; theme: string } & RouteComponentProps - >, -) => { - const classes = useStyles(); - const { location, history, children, 
setTheme, theme } = props; - - return ( -
- - - Ray
Ray Dashboard -
- - history.push("/node")} - > - NODES - - history.push("/job")} - > - JOBS - - history.push("/actors")} - > - ACTORS - - history.push("/log")} - > - LOGS - - history.push("/")} - > - BACK TO EXISTING DASHBOARD - - - { - window.scrollTo(0, 0); - }} - > - - - - - { - setTheme(theme === "dark" ? "light" : "dark"); - }} - > - - {theme === "dark" ? : } - - - - - -
-
{children}
-
- ); -}; - -export default BasicLayout; diff --git a/dashboard/client/src/pages/log/Logs.tsx b/dashboard/client/src/pages/log/Logs.tsx deleted file mode 100644 index 12218d52a0fa..000000000000 --- a/dashboard/client/src/pages/log/Logs.tsx +++ /dev/null @@ -1,306 +0,0 @@ -import { - Button, - InputAdornment, - LinearProgress, - List, - ListItem, - makeStyles, - Paper, - Switch, - TextField, -} from "@material-ui/core"; -import { SearchOutlined } from "@material-ui/icons"; -import React, { useEffect, useRef, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import LogVirtualView from "../../components/LogView/LogVirtualView"; -import { SearchInput } from "../../components/SearchComponent"; -import TitleCard from "../../components/TitleCard"; -import { getLogDetail } from "../../service/log"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - }, - table: { - marginTop: theme.spacing(4), - padding: theme.spacing(2), - }, - pageMeta: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - }, - search: { - margin: theme.spacing(1), - }, -})); - -type LogsProps = RouteComponentProps<{ host?: string; path?: string }> & { - theme?: "dark" | "light"; -}; - -const useLogs = (props: LogsProps) => { - const { - match: { params }, - location: { search: urlSearch }, - theme, - } = props; - const { host, path } = params; - const searchMap = new URLSearchParams(urlSearch); - const urlFileName = searchMap.get("fileName"); - const el = useRef(null); - const [origin, setOrigin] = useState(); - const [search, setSearch] = useState<{ - keywords?: string; - lineNumber?: string; - fontSize?: number; - revert?: boolean; - }>(); - const [fileName, setFileName] = useState(searchMap.get("fileName") || ""); - const [log, setLogs] = useState< - undefined | string | { [key: string]: string }[] - >(); - const [startTime, setStart] = useState(); - const [endTime, setEnd] = useState(); - - useEffect(() => 
{ - setFileName(urlFileName || ""); - }, [urlFileName]); - - useEffect(() => { - let url = "log_index"; - setLogs("Loading..."); - if (host) { - url = decodeURIComponent(host); - setOrigin(new URL(url).origin); - if (path) { - url += decodeURIComponent(path); - } - } else { - setOrigin(undefined); - } - getLogDetail(url) - .then((res) => { - if (res) { - setLogs(res); - } else { - setLogs("(null)"); - } - }) - .catch(() => { - setLogs("Failed to load"); - }); - }, [host, path]); - - return { - log, - origin, - host, - path, - el, - search, - setSearch, - theme, - fileName, - setFileName, - startTime, - setStart, - endTime, - setEnd, - }; -}; - -const Logs = (props: LogsProps) => { - const classes = useStyles(); - const { - log, - origin, - path, - el, - search, - setSearch, - theme, - fileName, - setFileName, - startTime, - setStart, - endTime, - setEnd, - } = useLogs(props); - let href = "#/log/"; - - if (origin) { - if (path) { - const after = decodeURIComponent(path).split("/"); - after.pop(); - if (after.length > 1) { - href += encodeURIComponent(origin); - href += "/"; - href += encodeURIComponent(after.join("/")); - } - } - } - - return ( -
- - - {!origin &&

Please choose an url to get log path

} - {origin && ( -

- Now Path: {origin} - {decodeURIComponent(path || "")} -

- )} - {origin && ( -
- - {typeof log === "object" && ( - { - setFileName(val); - }} - /> - )} -
- )} -
- - {typeof log === "object" && ( - - {log - .filter((e) => !fileName || e?.name?.includes(fileName)) - .map((e: { [key: string]: string }) => ( - - - {e.name} - - - ))} - - )} - {typeof log === "string" && log !== "Loading..." && ( -
-
- { - setSearch({ ...search, keywords: value }); - }, - type: "", - endAdornment: ( - - - - ), - }} - /> - { - setSearch({ ...search, lineNumber: value }); - }, - type: "", - endAdornment: ( - - - - ), - }} - /> - { - setSearch({ ...search, fontSize: Number(value) }); - }, - type: "", - }} - /> - { - setStart(val.target.value); - }} - InputLabelProps={{ - shrink: true, - }} - /> - { - setEnd(val.target.value); - }} - InputLabelProps={{ - shrink: true, - }} - /> -
- Reverse:{" "} - setSearch({ ...search, revert: v })} - /> - -
-
- -
- )} - {log === "Loading..." && ( -
-
- -
- )} -
-
-
- ); -}; - -export default Logs; diff --git a/dashboard/client/src/pages/node/NodeDetail.tsx b/dashboard/client/src/pages/node/NodeDetail.tsx deleted file mode 100644 index 6f5187bdb822..000000000000 --- a/dashboard/client/src/pages/node/NodeDetail.tsx +++ /dev/null @@ -1,287 +0,0 @@ -import { - Grid, - makeStyles, - Switch, - Tab, - TableContainer, - Tabs, -} from "@material-ui/core"; -import dayjs from "dayjs"; -import React from "react"; -import { Link, RouteComponentProps } from "react-router-dom"; -import ActorTable from "../../components/ActorTable"; -import Loading from "../../components/Loading"; -import PercentageBar from "../../components/PercentageBar"; -import { StatusChip } from "../../components/StatusChip"; -import TitleCard from "../../components/TitleCard"; -import RayletWorkerTable from "../../components/WorkerTable"; -import { ViewMeasures } from "../../type/raylet"; -import { memoryConverter } from "../../util/converter"; -import { useNodeDetail } from "./hook/useNodeDetail"; - -const useStyle = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - }, - paper: { - padding: theme.spacing(2), - marginTop: theme.spacing(2), - marginBottom: theme.spacing(2), - }, - label: { - fontWeight: "bold", - }, - tab: { - marginBottom: theme.spacing(2), - }, -})); - -const showMeasureKeys = [ - "local_total_resource", - "local_available_resource", - "actor_stats", - "task_dependency_manager_stats", - "reconstruction_policy_stats", - "scheduling_queue_stats", - "object_manager_stats", -]; - -const ViewDataDisplayer = ({ view }: { view?: ViewMeasures }) => { - if (!view) { - return null; - } - const { tags = "", ...otherProps } = view; - - return ( - - {tags.split(",").pop()?.split(":").slice(1).join(":")}= - {Object.keys(otherProps).length > 0 ? 
( - JSON.stringify(Object.values(otherProps).pop()) - ) : ( - null - )} - - ); -}; - -const NodeDetailPage = (props: RouteComponentProps<{ id: string }>) => { - const classes = useStyle(); - const { - params, - selectedTab, - nodeDetail, - msg, - isRefreshing, - onRefreshChange, - raylet, - handleChange, - } = useNodeDetail(props); - - return ( -
- - - -
- Auto Refresh: - -
- Request Status: {msg} -
- - - - - - - - {nodeDetail && selectedTab === "info" && ( -
- - -
Hostname
{" "} - {nodeDetail.hostname} -
- -
IP
{nodeDetail.ip} -
-
- - -
CPU (Logic/Physic)
{" "} - {nodeDetail.cpus[0]}/ {nodeDetail.cpus[1]} -
- -
Load (1/5/15min)
{" "} - {nodeDetail?.loadAvg[0] && - nodeDetail.loadAvg[0] - .map((e) => Number(e).toFixed(2)) - .join("/")} -
-
- - -
Load per CPU (1/5/15min)
{" "} - {nodeDetail?.loadAvg[1] && - nodeDetail.loadAvg[1] - .map((e) => Number(e).toFixed(2)) - .join("/")} -
- -
Boot Time
{" "} - {dayjs(nodeDetail.bootTime * 1000).format( - "YYYY/MM/DD HH:mm:ss", - )} -
-
- - -
Sent Tps
{" "} - {memoryConverter(nodeDetail?.net[0])}/s -
- -
Recieved Tps
{" "} - {memoryConverter(nodeDetail?.net[1])}/s -
-
- - -
Memory
{" "} - {nodeDetail?.mem && ( - - {memoryConverter(nodeDetail?.mem[0] - nodeDetail?.mem[1])}/ - {memoryConverter(nodeDetail?.mem[0])}({nodeDetail?.mem[2]}%) - - )} -
- -
CPU
{" "} - - {nodeDetail.cpu}% - -
-
- - {nodeDetail?.disk && - Object.entries(nodeDetail?.disk).map(([path, obj]) => ( - -
Disk ({path})
{" "} - {obj && ( - - {memoryConverter(obj.used)}/{memoryConverter(obj.total)} - ({obj.percent}%, {memoryConverter(obj.free)} free) - - )} -
- ))} -
- - -
Logs
{" "} - - log - -
-
-
- )} - {raylet && Object.keys(raylet).length > 0 && selectedTab === "raylet" && ( - -
- - -
Command
-
-
- {nodeDetail?.cmdline.join(" ")} -
-
-
- - -
Pid
{raylet?.pid} -
- -
Workers Num
{" "} - {raylet?.numWorkers} -
- -
Node Manager Port
{" "} - {raylet?.nodeManagerPort} -
-
- {showMeasureKeys - .map((e) => raylet.viewData.find((view) => view.viewName === e)) - .map((e) => - e ? ( - -

- {e.viewName - .split("_") - .map((e) => e[0].toUpperCase() + e.slice(1)) - .join(" ")} -

- - {e.measures.map((e) => ( - - ))} - -
- ) : null, - )} -
-
- )} - {nodeDetail?.workers && selectedTab === "worker" && ( - - - - - - )} - {nodeDetail?.actors && selectedTab === "actor" && ( - - - - - - )} -
-
- ); -}; - -export default NodeDetailPage; diff --git a/dashboard/client/src/pages/node/hook/useNodeDetail.ts b/dashboard/client/src/pages/node/hook/useNodeDetail.ts deleted file mode 100644 index 1ca3570a20ff..000000000000 --- a/dashboard/client/src/pages/node/hook/useNodeDetail.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { useCallback, useContext, useEffect, useRef, useState } from "react"; -import { RouteComponentProps } from "react-router-dom"; -import { GlobalContext } from "../../../App"; -import { getNodeDetail } from "../../../service/node"; -import { NodeDetailExtend } from "../../../type/node"; - -export const useNodeDetail = (props: RouteComponentProps<{ id: string }>) => { - const { - match: { params }, - } = props; - const [selectedTab, setTab] = useState("info"); - const [nodeDetail, setNode] = useState(); - const [msg, setMsg] = useState("Loading the node infos..."); - const { namespaceMap } = useContext(GlobalContext); - const [isRefreshing, setRefresh] = useState(true); - const tot = useRef(); - const onRefreshChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - const getDetail = useCallback(async () => { - if (!isRefreshing) { - return; - } - const { data } = await getNodeDetail(params.id); - const { data: rspData, msg, result } = data; - if (rspData?.detail) { - setNode(rspData.detail); - } - - if (msg) { - setMsg(msg); - } - - if (result === false) { - setMsg("Node Query Error Please Check Node Name"); - setRefresh(false); - } - - tot.current = setTimeout(getDetail, 4000); - }, [isRefreshing, params.id]); - const raylet = nodeDetail?.raylet; - const handleChange = (event: React.ChangeEvent<{}>, newValue: string) => { - setTab(newValue); - }; - - useEffect(() => { - getDetail(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getDetail]); - - return { - params, - selectedTab, - nodeDetail, - msg, - isRefreshing, - onRefreshChange, - raylet, - handleChange, - namespaceMap, - }; -}; 
diff --git a/dashboard/client/src/pages/node/hook/useNodeList.ts b/dashboard/client/src/pages/node/hook/useNodeList.ts deleted file mode 100644 index 96a3339ba4e8..000000000000 --- a/dashboard/client/src/pages/node/hook/useNodeList.ts +++ /dev/null @@ -1,74 +0,0 @@ -import { useCallback, useEffect, useRef, useState } from "react"; -import { getNodeList } from "../../../service/node"; -import { NodeDetail } from "../../../type/node"; -import { useSorter } from "../../../util/hook"; - -export const useNodeList = () => { - const [nodeList, setList] = useState([]); - const [msg, setMsg] = useState("Loading the nodes infos..."); - const [isRefreshing, setRefresh] = useState(true); - const [mode, setMode] = useState("table"); - const [filter, setFilter] = useState< - { key: "hostname" | "ip" | "state"; val: string }[] - >([]); - const [page, setPage] = useState({ pageSize: 10, pageNo: 1 }); - const { sorterFunc, setOrderDesc, setSortKey, sorterKey } = useSorter("cpu"); - const tot = useRef(); - const changeFilter = (key: "hostname" | "ip" | "state", val: string) => { - const f = filter.find((e) => e.key === key); - if (f) { - f.val = val; - } else { - filter.push({ key, val }); - } - setFilter([...filter]); - }; - const onSwitchChange = (event: React.ChangeEvent) => { - setRefresh(event.target.checked); - }; - const getList = useCallback(async () => { - if (!isRefreshing) { - return; - } - const { data } = await getNodeList(); - const { data: rspData, msg } = data; - setList(rspData.summary || []); - if (msg) { - setMsg(msg); - } else { - setMsg(""); - } - tot.current = setTimeout(getList, 4000); - }, [isRefreshing]); - - useEffect(() => { - getList(); - return () => { - if (tot.current) { - clearTimeout(tot.current); - } - }; - }, [getList]); - - return { - nodeList: nodeList - .map((e) => ({ ...e, state: e.raylet.state })) - .sort((a, b) => (a.raylet.nodeId > b.raylet.nodeId ? 
1 : -1)) - .sort(sorterFunc) - .filter((node) => - filter.every((f) => node[f.key] && node[f.key].includes(f.val)), - ), - msg, - isRefreshing, - onSwitchChange, - changeFilter, - page, - originalNodes: nodeList, - setPage: (key: string, val: number) => setPage({ ...page, [key]: val }), - sorterKey, - setSortKey, - setOrderDesc, - mode, - setMode, - }; -}; diff --git a/dashboard/client/src/pages/node/index.tsx b/dashboard/client/src/pages/node/index.tsx deleted file mode 100644 index ea258cb6d09b..000000000000 --- a/dashboard/client/src/pages/node/index.tsx +++ /dev/null @@ -1,351 +0,0 @@ -import { - Button, - ButtonGroup, - Grid, - Paper, - Switch, - Table, - TableBody, - TableCell, - TableContainer, - TableHead, - TableRow, - Tooltip, -} from "@material-ui/core"; -import { makeStyles } from "@material-ui/core/styles"; -import Pagination from "@material-ui/lab/Pagination"; -import React from "react"; -import { Link } from "react-router-dom"; -import Loading from "../../components/Loading"; -import PercentageBar from "../../components/PercentageBar"; -import { SearchInput, SearchSelect } from "../../components/SearchComponent"; -import StateCounter from "../../components/StatesCounter"; -import { StatusChip } from "../../components/StatusChip"; -import TitleCard from "../../components/TitleCard"; -import { NodeDetail } from "../../type/node"; -import { memoryConverter } from "../../util/converter"; -import { useNodeList } from "./hook/useNodeList"; - -const useStyles = makeStyles((theme) => ({ - root: { - padding: theme.spacing(2), - width: "100%", - position: "relative", - }, -})); - -const columns = [ - "State", - "ID", - "Host", - "IP", - "CPU Usage", - "Memory", - "Disk(root)", - "Sent", - "Received", - "Log", -]; - -export const brpcLinkChanger = (href: string) => { - const { location } = window; - const { pathname } = location; - const pathArr = pathname.split("/"); - if (pathArr.some((e) => e.split(".").length > 1)) { - const index = pathArr.findIndex((e) => 
e.includes(".")); - const resultArr = pathArr.slice(0, index); - resultArr.push(href); - return `${location.protocol}//${location.host}${resultArr.join("/")}`; - } - - return `http://${href}`; -}; - -export const NodeCard = (props: { node: NodeDetail }) => { - const { node } = props; - - if (!node) { - return null; - } - - const { raylet, hostname, ip, cpu, mem, net, disk, logUrl } = node; - const { nodeId, state } = raylet; - - return ( - -

- {nodeId}{" "} -

-

- - - - - - {hostname}({ip}) - - {net && net[0] >= 0 && ( - - Sent{" "} - {memoryConverter(net[0])}/s{" "} - Received{" "} - {memoryConverter(net[1])}/s - - )} - -

- - {cpu >= 0 && ( - - CPU - - {cpu}% - - - )} - {mem && ( - - Memory - - {memoryConverter(mem[0] - mem[1])}/{memoryConverter(mem[0])}( - {mem[2]}%) - - - )} - {disk && disk["/"] && ( - - Disk('/') - - {memoryConverter(disk["/"].used)}/ - {memoryConverter(disk["/"].total)}({disk["/"].percent}%) - - - )} - - - - - - -
- ); -}; - -const Nodes = () => { - const classes = useStyles(); - const { - msg, - isRefreshing, - onSwitchChange, - nodeList, - changeFilter, - page, - setPage, - setSortKey, - setOrderDesc, - mode, - setMode, - } = useNodeList(); - - return ( -
- - - Auto Refresh: - -
- Request Status: {msg} -
- - - - - - - changeFilter("hostname", value.trim())} - /> - - - changeFilter("ip", value.trim())} - /> - - - changeFilter("state", value.trim())} - options={["ALIVE", "DEAD"]} - /> - - - - setPage("pageSize", Math.min(Number(value), 500) || 10) - } - /> - - - setSortKey(val)} - /> - - - - Reverse: - setOrderDesc(checked)} /> - - - - - - - - - -
- setPage("pageNo", pageNo)} - /> -
- {mode === "table" && ( - - - - - {columns.map((col) => ( - - {col} - - ))} - - - - {nodeList - .slice( - (page.pageNo - 1) * page.pageSize, - page.pageNo * page.pageSize, - ) - .map( - ( - { - hostname = "", - ip = "", - cpu = 0, - mem = [], - disk, - net = [0, 0], - raylet, - logUrl, - }: NodeDetail, - i, - ) => ( - - - - - - - - {raylet.nodeId.slice(0, 5)} - - - - {hostname} - {ip} - - - {cpu}% - - - - - {memoryConverter(mem[0] - mem[1])}/ - {memoryConverter(mem[0])}({mem[2]}%) - - - - {disk && disk["/"] && ( - - {memoryConverter(disk["/"].used)}/ - {memoryConverter(disk["/"].total)}( - {disk["/"].percent}%) - - )} - - - {memoryConverter(net[0])}/s - - - {memoryConverter(net[1])}/s - - - - Log - - - - ), - )} - -
-
- )} - {mode === "card" && ( - - {nodeList - .slice( - (page.pageNo - 1) * page.pageSize, - page.pageNo * page.pageSize, - ) - .map((e) => ( - - - - ))} - - )} -
-
- ); -}; - -export default Nodes; diff --git a/dashboard/client/src/service/actor.ts b/dashboard/client/src/service/actor.ts deleted file mode 100644 index 425fd62a44de..000000000000 --- a/dashboard/client/src/service/actor.ts +++ /dev/null @@ -1,14 +0,0 @@ -import axios from "axios"; -import { Actor } from "../type/actor"; - -export const getActors = () => { - return axios.get<{ - result: boolean; - message: string; - data: { - actors: { - [actorId: string]: Actor; - }; - }; - }>("logical/actors"); -}; diff --git a/dashboard/client/src/service/cluster.ts b/dashboard/client/src/service/cluster.ts deleted file mode 100644 index 9bf53e76dbb9..000000000000 --- a/dashboard/client/src/service/cluster.ts +++ /dev/null @@ -1,6 +0,0 @@ -import axios from "axios"; -import { RayConfigRsp } from "../type/config"; - -export const getRayConfig = () => { - return axios.get("api/ray_config"); -}; diff --git a/dashboard/client/src/service/job.ts b/dashboard/client/src/service/job.ts deleted file mode 100644 index fc5d5452db68..000000000000 --- a/dashboard/client/src/service/job.ts +++ /dev/null @@ -1,10 +0,0 @@ -import axios from "axios"; -import { JobDetailRsp, JobListRsp } from "../type/job"; - -export const getJobList = () => { - return axios.get("jobs?view=summary"); -}; - -export const getJobDetail = (id: string) => { - return axios.get(`jobs/${id}`); -}; diff --git a/dashboard/client/src/service/log.ts b/dashboard/client/src/service/log.ts deleted file mode 100644 index b485b12f1684..000000000000 --- a/dashboard/client/src/service/log.ts +++ /dev/null @@ -1,35 +0,0 @@ -import axios from "axios"; - -export const getLogDetail = async (url: string) => { - if (window.location.pathname !== "/" && url !== "log_index") { - const pathArr = window.location.pathname.split("/"); - if (pathArr.length > 1) { - const idx = pathArr.findIndex((e) => e.includes(":")); - if (idx > -1) { - const afterArr = pathArr.slice(0, idx); - afterArr.push(url.replace(/https?:\/\//, "")); - url = 
afterArr.join("/"); - } - } - } - const rsp = await axios.get( - url === "log_index" ? url : `log_proxy?url=${encodeURIComponent(url)}`, - ); - if (rsp.headers["content-type"]?.includes("html")) { - const el = document.createElement("div"); - el.innerHTML = rsp.data; - const arr = [].map.call( - el.getElementsByTagName("li"), - (li: HTMLLIElement) => { - const a = li.children[0] as HTMLAnchorElement; - return { - name: li.innerText, - href: li.innerText.includes("http") ? a.href : a.pathname, - } as { [key: string]: string }; - }, - ); - return arr as { [key: string]: string }[]; - } - - return rsp.data as string; -}; diff --git a/dashboard/client/src/service/node.ts b/dashboard/client/src/service/node.ts deleted file mode 100644 index 5eac1dc9cafb..000000000000 --- a/dashboard/client/src/service/node.ts +++ /dev/null @@ -1,10 +0,0 @@ -import axios from "axios"; -import { NodeDetailRsp, NodeListRsp } from "../type/node"; - -export const getNodeList = async () => { - return await axios.get("nodes?view=summary"); -}; - -export const getNodeDetail = async (id: string) => { - return await axios.get(`nodes/${id}`); -}; diff --git a/dashboard/client/src/service/util.ts b/dashboard/client/src/service/util.ts deleted file mode 100644 index 966c82db2919..000000000000 --- a/dashboard/client/src/service/util.ts +++ /dev/null @@ -1,52 +0,0 @@ -import axios from "axios"; - -type CMDRsp = { - result: boolean; - msg: string; - data: { - output: string; - }; -}; - -export const getJstack = (ip: string, pid: string) => { - return axios.get("utils/jstack", { - params: { - ip, - pid, - }, - }); -}; - -export const getJmap = (ip: string, pid: string) => { - return axios.get("utils/jmap", { - params: { - ip, - pid, - }, - }); -}; - -export const getJstat = (ip: string, pid: string, options: string) => { - return axios.get("utils/jstat", { - params: { - ip, - pid, - options, - }, - }); -}; - -type NamespacesRsp = { - result: boolean; - msg: string; - data: { - namespaces: { - 
namespaceId: string; - hostNameList: string[]; - }[]; - }; -}; - -export const getNamespaces = () => { - return axios.get("namespaces"); -}; diff --git a/dashboard/client/src/theme.ts b/dashboard/client/src/theme.ts deleted file mode 100644 index f83d58b5ad46..000000000000 --- a/dashboard/client/src/theme.ts +++ /dev/null @@ -1,61 +0,0 @@ -import { blue, blueGrey, grey, lightBlue } from "@material-ui/core/colors"; -import { createMuiTheme } from "@material-ui/core/styles"; - -const basicTheme = { - typography: { - fontSize: 12, - fontFamily: [ - "-apple-system", - "BlinkMacSystemFont", - '"Segoe UI"', - "Roboto", - '"Helvetica Neue"', - "Arial", - "sans-serif", - '"Apple Color Emoji"', - '"Segoe UI Emoji"', - '"Segoe UI Symbol"', - ].join(","), - }, - props: { - MuiPaper: { - elevation: 0, - }, - }, -}; - -export const lightTheme = createMuiTheme({ - ...basicTheme, - palette: { - primary: blue, - secondary: lightBlue, - text: { - primary: grey[900], - secondary: grey[800], - disabled: grey[400], - hint: grey[300], - }, - background: { - paper: "#fff", - default: blueGrey[50], - }, - }, -}); - -export const darkTheme = createMuiTheme({ - ...basicTheme, - palette: { - primary: blue, - secondary: lightBlue, - text: { - primary: blueGrey[50], - secondary: blueGrey[100], - disabled: blueGrey[200], - hint: blueGrey[300], - }, - background: { - paper: grey[800], - default: grey[900], - }, - }, -}); diff --git a/dashboard/client/src/type/actor.ts b/dashboard/client/src/type/actor.ts deleted file mode 100644 index 8a00c0e41269..000000000000 --- a/dashboard/client/src/type/actor.ts +++ /dev/null @@ -1,94 +0,0 @@ -export enum ActorEnum { - ALIVE = "ALIVE", - PENDING = "PENDING", - RECONSTRUCTING = "RECONSTRUCTING", - DEAD = "DEAD", -} - -export type Address = { - rayletId: string; - ipAddress: string; - port: number; - workerId: string; -}; - -export type TaskSpec = { - actorCreationTaskSpec: { - actorId: string; - dynamicWorkerOptions: string[]; - extensionData: string; - 
isAsyncio: boolean; - isDetached: boolean; - maxActorRestarts: boolean; - maxConcurrency: number; - name: string; - }; - args: { - data: string; - metadata: string; - nestedInlinedIds: string[]; - objectIds: string[]; - }[]; - callerAddress: { - ipAddress: string; - port: number; - rayletId: string; - workerId: string; - }; - callerId: string; - functionDescriptor: { - javaFunctionDescriptor: { - className: string; - functionName: string; - signature: string; - }; - pythonFunctionDescriptor: { - className: string; - functionName: string; - signature: string; - }; - }; - jobId: string; - language: string; - maxRetries: number; - numReturns: string; - parentCounter: string; - parentTaskId: string; - requiredPlacementResources: { - [key: string]: number; - }; - requiredResources: { - [key: string]: number; - }; - sourceActorId: string; - taskId: string; - type: string; -}; - -export type Actor = { - actorId: string; - children: { [key: string]: Actor }; - taskSpec: TaskSpec; - ipAddress: string; - isDirectCall: boolean; - jobId: string; - numExecutedTasks: number; - numLocalObjects: number; - numObjectIdsInScope: number; - state: ActorEnum | string; // PENDING, ALIVE, RECONSTRUCTING, DEAD - taskQueueLength: number; - usedObjectStoreMemory: number; - usedResources: { [key: string]: string | number }; - timestamp: number; - actorTitle: string; - averageTaskExecutionSpeed: number; - nodeId: string; - pid: number; - ownerAddress: Address; - address: Address; - maxReconstructions: string; - remainingReconstructions: string; - isDetached: false; - name: string; - numRestarts: string; -}; diff --git a/dashboard/client/src/type/config.d.ts b/dashboard/client/src/type/config.d.ts deleted file mode 100644 index 40a34a25fcd5..000000000000 --- a/dashboard/client/src/type/config.d.ts +++ /dev/null @@ -1,22 +0,0 @@ -export type RayConfig = { - userName: string; - workNodeNumber: number; - headNodeNumber: number; - containerVcores: number; - containerMemory: number; - clusterName: 
string; - supremeFo: boolean; - jobManagerPort: number; - externalRedisAddresses: string; - envParams: string; - sourceCodeLink: string; - imageUrl: string; -}; - -export type RayConfigRsp = { - result: boolean; - msg: string; - data: { - config: RayConfig; - }; -}; diff --git a/dashboard/client/src/type/event.d.ts b/dashboard/client/src/type/event.d.ts deleted file mode 100644 index 4f586f9a04d5..000000000000 --- a/dashboard/client/src/type/event.d.ts +++ /dev/null @@ -1,31 +0,0 @@ -export type Event = { - eventId: string; - jobId: string; - nodeId: string; - sourceType: string; - sourceHostname: string; - sourcePid: number; - label: string; - message: string; - timestamp: number; - severity: string; -}; - -export type EventRsp = { - result: boolean; - msg: string; - data: { - jobId: string; - events: Event[]; - }; -}; - -export type EventGlobalRsp = { - result: boolean; - msg: string; - data: { - events: { - global: Event[]; - }; - }; -}; diff --git a/dashboard/client/src/type/job.d.ts b/dashboard/client/src/type/job.d.ts deleted file mode 100644 index c5ca4dce874c..000000000000 --- a/dashboard/client/src/type/job.d.ts +++ /dev/null @@ -1,70 +0,0 @@ -import { Actor } from "./actor"; -import { Worker } from "./worker"; - -export type Job = { - jobId: string; - name: string; - owner: string; - language: string; - driverEntry: string; - state: string; - timestamp: number; - namespaceId: string; - driverPid: number; - driverIpAddress: string; - isDead: boolean; -}; - -export type PythonDependenciey = string; - -export type JavaDependency = { - name: string; - version: string; - md5: string; - url: string; -}; - -export type JobInfo = { - url: string; - driverArgs: string; - customConfig: { - [k: string]: string; - }; - jvmOptions: string; - dependencies: { - python: PythonDependenciey[]; - java: JavaDependency[]; - }; - driverStarted: boolean; - submitTime: string; - startTime: null | string | number; - endTime: null | string | number; - driverIpAddress: string; - 
driverHostname: string; - driverPid: number; - eventUrl: string; - failErrorMessage: string; - driverCmdline: string; -} & Job; - -export type JobDetail = { - jobInfo: JobInfo; - jobActors: { [id: string]: Actor }; - jobWorkers: Worker[]; -}; - -export type JobDetailRsp = { - data: { - detail: JobDetail; - }; - msg: string; - result: boolean; -}; - -export type JobListRsp = { - data: { - summary: Job[]; - }; - msg: string; - result: boolean; -}; diff --git a/dashboard/client/src/type/node.d.ts b/dashboard/client/src/type/node.d.ts deleted file mode 100644 index 12106d9adab0..000000000000 --- a/dashboard/client/src/type/node.d.ts +++ /dev/null @@ -1,62 +0,0 @@ -import { Actor } from "./actor"; -import { Raylet } from "./raylet"; -import { Worker } from "./worker"; - -export type NodeDetail = { - now: number; - hostname: string; - ip: string; - cpu: number; // cpu usage - cpus: number[]; // Logic CPU Count, Physical CPU Count - mem: number[]; // total memory, free memory, memory used ratio - bootTime: number; // start time - loadAvg: number[][]; // recent 1,5,15 minitues system load,load per cpu http://man7.org/linux/man-pages/man3/getloadavg.3.html - disk: { - // disk used on root - "/": { - total: number; - used: number; - free: number; - percent: number; - }; - // disk used on tmp - "/tmp": { - total: number; - used: number; - free: number; - percent: number; - }; - }; - net: number[]; // sent tps, received tps - raylet: Raylet; - logCounts: number; - errorCounts: number; - actors: { [id: string]: Actor }; - cmdline: string[]; - state: string; - logUrl: string; -}; - -export type NodeListRsp = { - data: { - summary: NodeDetail[]; - }; - result: boolean; - msg: string; -}; - -export type NodeDetailExtend = { - workers: Worker[]; - raylet: Raylet; - actors: { - [actorId: string]: Actor; - }; -} & NodeDetail; - -export type NodeDetailRsp = { - data: { - detail: NodeDetailExtend; - }; - msg: string; - result: boolean; -}; diff --git 
a/dashboard/client/src/type/raylet.d.ts b/dashboard/client/src/type/raylet.d.ts deleted file mode 100644 index 459b4c2b9086..000000000000 --- a/dashboard/client/src/type/raylet.d.ts +++ /dev/null @@ -1,28 +0,0 @@ -export type ViewMeasures = { - tags: string; - int_value?: number; - double_value?: number; - distribution_min?: number; - distribution_mean?: number; - distribution_max?: number; - distribution_count?: number; - distribution_bucket_boundaries?: number[]; - distribution_bucket_counts?: number[]; -}; - -export type ViewData = { - viewName: string; - measures: ViewMeasures[]; -}; - -export type Raylet = { - viewData: ViewData[]; - numWorkers: number; - pid: number; - nodeId: string; - nodeManagerPort: number; - brpcPort: pid; - state: string; - startTime: number; - terminateTime: number; -}; diff --git a/dashboard/client/src/type/worker.d.ts b/dashboard/client/src/type/worker.d.ts deleted file mode 100644 index cf35bfa018dd..000000000000 --- a/dashboard/client/src/type/worker.d.ts +++ /dev/null @@ -1,36 +0,0 @@ -export type CoreWorkerStats = { - currentTaskFuncDesc: string; - ipAddress: string; - port: string; - actorId: string; - usedResources: { [key: string]: number }; - numExecutedTasks: number; - workerId: string; - actorTitle: string; - jobId: string; -}; - -export type Worker = { - createTime: number; - cpuPercent: number; - cmdline: string[]; - memoryInfo: { - rss: number; // aka “Resident Set Size”, this is the non-swapped physical memory a process has used. On UNIX it matches “top“‘s RES column). On Windows this is an alias for wset field and it matches “Mem Usage” column of taskmgr.exe. - vms: number; // aka “Virtual Memory Size”, this is the total amount of virtual memory used by the process. On UNIX it matches “top“‘s VIRT column. On Windows this is an alias for pagefile field and it matches “Mem Usage” “VM Size” column of taskmgr.exe. - pfaults: number; // number of page faults. - pageins: number; // number of actual pageins. 
- [key: string]: number; - }; - cpuTimes: { - user: number; - system: number; - childrenUser: number; - childrenUystem: number; - iowait?: number; - }; - pid: number; - coreWorkerStats: CoreWorkerStats[]; - language: string; - hostname: string; - ip: hostname; -}; diff --git a/dashboard/client/src/util/converter.ts b/dashboard/client/src/util/converter.ts deleted file mode 100644 index 427ae86b78f3..000000000000 --- a/dashboard/client/src/util/converter.ts +++ /dev/null @@ -1,27 +0,0 @@ -export const memoryConverter = (bytes: number) => { - if (bytes < 1024) { - return `${bytes}KB`; - } - - if (bytes < 1024 ** 2) { - return `${(bytes / 1024 ** 1).toFixed(2)}KB`; - } - - if (bytes < 1024 ** 3) { - return `${(bytes / 1024 ** 2).toFixed(2)}MB`; - } - - if (bytes < 1024 ** 4) { - return `${(bytes / 1024 ** 3).toFixed(2)}GB`; - } - - if (bytes < 1024 ** 5) { - return `${(bytes / 1024 ** 4).toFixed(2)}TB`; - } - - if (bytes < 1024 ** 6) { - return `${(bytes / 1024 ** 5).toFixed(2)}TB`; - } - - return ""; -}; diff --git a/dashboard/client/src/util/func.tsx b/dashboard/client/src/util/func.tsx deleted file mode 100644 index c07ef70fe85b..000000000000 --- a/dashboard/client/src/util/func.tsx +++ /dev/null @@ -1,28 +0,0 @@ -import { Tooltip } from "@material-ui/core"; -import React, { CSSProperties } from "react"; - -export const longTextCut = (text: string = "", len: number = 28) => ( - - {text.length > len ? text.slice(0, len) + "..." : text} - -); - -export const jsonFormat = (str: string | object) => { - const preStyle = { - textAlign: "left", - wordBreak: "break-all", - whiteSpace: "pre-wrap", - } as CSSProperties; - if (typeof str === "object") { - return
{JSON.stringify(str, null, 2)}
; - } - try { - const j = JSON.parse(str); - if (typeof j !== "object") { - return JSON.stringify(j); - } - return
{JSON.stringify(j, null, 2)}
; - } catch (e) { - return str; - } -}; diff --git a/dashboard/client/src/util/hook.ts b/dashboard/client/src/util/hook.ts deleted file mode 100644 index 3c6f61b06ef8..000000000000 --- a/dashboard/client/src/util/hook.ts +++ /dev/null @@ -1,63 +0,0 @@ -import { get } from "lodash"; -import { useState } from "react"; - -export const useFilter = () => { - const [filters, setFilters] = useState<{ key: KeyType; val: string }[]>([]); - const changeFilter = (key: KeyType, val: string) => { - const f = filters.find((e) => e.key === key); - if (f) { - f.val = val; - } else { - filters.push({ key, val }); - } - setFilters([...filters]); - }; - const filterFunc = (instance: { [key: string]: any }) => { - return filters.every( - (f) => !f.val || get(instance, f.key, "").toString().includes(f.val), - ); - }; - - return { - changeFilter, - filterFunc, - }; -}; - -export const useSorter = (initialSortKey?: string) => { - const [sorter, setSorter] = useState({ - key: initialSortKey || "", - desc: false, - }); - - const sorterFunc = ( - instanceA: { [key: string]: any }, - instanceB: { [key: string]: any }, - ) => { - if (!sorter.key) { - return 0; - } - - let [b, a] = [instanceA, instanceB]; - if (sorter.desc) { - [a, b] = [instanceA, instanceB]; - } - - if (!get(a, sorter.key)) { - return -1; - } - - if (!get(b, sorter.key)) { - return 1; - } - - return get(a, sorter.key) > get(b, sorter.key) ? 
1 : -1; - }; - - return { - sorterFunc, - setSortKey: (key: string) => setSorter({ ...sorter, key }), - setOrderDesc: (desc: boolean) => setSorter({ ...sorter, desc }), - sorterKey: sorter.key, - }; -}; diff --git a/dashboard/client/src/util/localData.ts b/dashboard/client/src/util/localData.ts deleted file mode 100644 index 0066c4788b95..000000000000 --- a/dashboard/client/src/util/localData.ts +++ /dev/null @@ -1,12 +0,0 @@ -export const getLocalStorage = (key: string) => { - const data = window.localStorage.getItem(key); - try { - return JSON.parse(data || "") as T; - } catch { - return data; - } -}; - -export const setLocalStorage = (key: string, value: any) => { - return window.localStorage.setItem(key, JSON.stringify(value)); -}; diff --git a/dashboard/head.py b/dashboard/head.py index f1ef75ef478d..e8e9119132d2 100644 --- a/dashboard/head.py +++ b/dashboard/head.py @@ -159,9 +159,7 @@ async def run(self): if not gcs_address: raise Exception("GCS address not found.") logger.info("Connect to GCS at %s", gcs_address) - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel( - gcs_address, options=options) + channel = aiogrpc.insecure_channel(gcs_address) except Exception as ex: logger.error("Connect to GCS failed: %s, retry...", ex) await asyncio.sleep( diff --git a/dashboard/modules/logical_view/logical_view_head.py b/dashboard/modules/logical_view/logical_view_head.py index 6b8e0bae1ecb..cf29db637da1 100644 --- a/dashboard/modules/logical_view/logical_view_head.py +++ b/dashboard/modules/logical_view/logical_view_head.py @@ -46,9 +46,7 @@ async def kill_actor(self, req) -> aiohttp.web.Response: except KeyError: return rest_response(success=False, message="Bad Request") try: - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel( - f"{ip_address}:{port}", options=options) + channel = aiogrpc.insecure_channel(f"{ip_address}:{port}") stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel) await 
stub.KillActor( diff --git a/dashboard/modules/reporter/reporter_agent.py b/dashboard/modules/reporter/reporter_agent.py index e604f7463f86..3d9472a3dee3 100644 --- a/dashboard/modules/reporter/reporter_agent.py +++ b/dashboard/modules/reporter/reporter_agent.py @@ -77,25 +77,7 @@ def __init__(self, dashboard_agent): "node_cpu": Gauge("node_cpu", "Total CPU usage on a ray node", "percentage", ["ip"]), "node_mem": Gauge("node_mem", "Total memory usage on a ray node", - "bytes", ["ip"]), - "node_disk_usage": Gauge("node_disk_usage", - "Total disk usage (bytes) on a ray node", - "bytes", ["ip"]), - "node_disk_utilization_percentage": Gauge( - "node_disk_utilization_percentage", - "Total disk utilization (percentage) on a ray node", - "percentage", ["ip"]), - "node_network_sent": Gauge("node_network_sent", - "Total network sent", "bytes", ["ip"]), - "node_network_received": Gauge("node_network_received", - "Total network received", "bytes", - ["ip"]), - "node_network_send_speed": Gauge("node_network_send_speed", - "Network send speed", "bytes/sec", - ["ip"]), - "node_network_receive_speed": Gauge("node_network_receive_speed", - "Network receive speed", - "bytes/sec", ["ip"]), + "mb", ["ip"]), "raylet_cpu": Gauge("raylet_cpu", "CPU usage of the raylet on a node.", "percentage", ["ip", "pid"]), @@ -255,10 +237,8 @@ def _get_all_stats(self): self._network_stats_hist.append((now, network_stats)) self._network_stats_hist = self._network_stats_hist[-7:] then, prev_network_stats = self._network_stats_hist[0] - prev_send, prev_recv = prev_network_stats - now_send, now_recv = network_stats - network_speed_stats = ((now_send - prev_send) / (now - then), - (now_recv - prev_recv) / (now - then)) + netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then), + (network_stats[1] - prev_network_stats[1]) / (now - then)) return { "now": now, "hostname": self._hostname, @@ -271,8 +251,7 @@ def _get_all_stats(self): "loadAvg": self._get_load_avg(), "disk": 
self._get_disk_usage(), "gpus": self._get_gpu_usage(), - "network": network_stats, - "network_speed": network_speed_stats, + "net": netstats, "cmdline": self._get_raylet_cmdline(), } @@ -285,45 +264,10 @@ def _record_stats(self, stats): # -- Mem per node -- total, avail, _ = stats["mem"] - mem_usage = float(total - avail) + mem_usage = float(total - avail) / 1e6 mem_record = Record( gauge=self._gauges["node_mem"], value=mem_usage, tags={"ip": ip}) - # -- Disk per node -- - used, free = 0, 0 - for entry in stats["disk"].values(): - used += entry.used - free += entry.free - disk_utilization = float(used / (used + free)) * 100 - disk_usage_record = Record( - gauge=self._gauges["node_disk_usage"], value=used, tags={"ip": ip}) - disk_utilization_percentage_record = Record( - gauge=self._gauges["node_disk_utilization_percentage"], - value=disk_utilization, - tags={"ip": ip}) - - # -- Network speed (send/receive) stats per node -- - network_stats = stats["network"] - network_sent_record = Record( - gauge=self._gauges["node_network_sent"], - value=network_stats[0], - tags={"ip": ip}) - network_received_record = Record( - gauge=self._gauges["node_network_received"], - value=network_stats[1], - tags={"ip": ip}) - - # -- Network speed (send/receive) per node -- - network_speed_stats = stats["network_speed"] - network_send_speed_record = Record( - gauge=self._gauges["node_network_send_speed"], - value=network_speed_stats[0], - tags={"ip": ip}) - network_receive_speed_record = Record( - gauge=self._gauges["node_network_receive_speed"], - value=network_speed_stats[1], - tags={"ip": ip}) - raylet_stats = self._get_raylet_stats() raylet_pid = str(raylet_stats["pid"]) # -- raylet CPU -- @@ -346,12 +290,8 @@ def _record_stats(self, stats): "pid": raylet_pid }) - self._metrics_agent.record_reporter_stats([ - cpu_record, mem_record, disk_usage_record, - disk_utilization_percentage_record, network_sent_record, - network_received_record, network_send_speed_record, - 
network_receive_speed_record, raylet_cpu_record, raylet_mem_record - ]) + self._metrics_agent.record_reporter_stats( + [cpu_record, mem_record, raylet_cpu_record, raylet_mem_record]) async def _perform_iteration(self, aioredis_client): """Get any changes to the log files and push updates to Redis.""" diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py index 7d375c8d66c4..8faef274d60c 100644 --- a/dashboard/modules/reporter/reporter_head.py +++ b/dashboard/modules/reporter/reporter_head.py @@ -38,9 +38,7 @@ async def _update_stubs(self, change): if change.new: node_id, ports = change.new ip = DataSource.node_id_to_ip[node_id] - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel( - f"{ip}:{ports[1]}", options=options) + channel = aiogrpc.insecure_channel(f"{ip}:{ports[1]}") stub = reporter_pb2_grpc.ReporterServiceStub(channel) self._stubs[ip] = stub @@ -78,7 +76,10 @@ async def get_ray_config(self, req) -> aiohttp.web.Response: payload = { "min_workers": cfg["min_workers"], - "max_workers": cfg["max_workers"] + "max_workers": cfg["max_workers"], + "initial_workers": cfg["initial_workers"], + "autoscaling_mode": cfg["autoscaling_mode"], + "idle_timeout_minutes": cfg["idle_timeout_minutes"], } try: diff --git a/dashboard/modules/reporter/tests/test_reporter.py b/dashboard/modules/reporter/tests/test_reporter.py index 72617562f92c..001ea42a5b88 100644 --- a/dashboard/modules/reporter/tests/test_reporter.py +++ b/dashboard/modules/reporter/tests/test_reporter.py @@ -105,13 +105,7 @@ def test_case_stats_exist(): prom_addresses) return all([ "ray_node_cpu" in metric_names, "ray_node_mem" in metric_names, - "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names, - "ray_node_disk_usage" in metric_names, - "ray_node_disk_utilization_percentage" in metric_names, - "ray_node_network_sent" in metric_names, - "ray_node_network_received" in metric_names, - "ray_node_network_send_speed" in 
metric_names, - "ray_node_network_receive_speed" in metric_names + "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names ]) def test_case_ip_correct(): diff --git a/dashboard/modules/stats_collector/stats_collector_consts.py b/dashboard/modules/stats_collector/stats_collector_consts.py index cdcbf6bd126d..55119cd75dfa 100644 --- a/dashboard/modules/stats_collector/stats_collector_consts.py +++ b/dashboard/modules/stats_collector/stats_collector_consts.py @@ -1,8 +1,5 @@ -import ray - NODE_STATS_UPDATE_INTERVAL_SECONDS = 1 RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS = 1 ACTOR_CHANNEL = "ACTOR" ERROR_INFO_UPDATE_INTERVAL_SECONDS = 5 LOG_INFO_UPDATE_INTERVAL_SECONDS = 5 -NIL_NODE_ID = ray.NodeID.nil().hex() diff --git a/dashboard/modules/stats_collector/stats_collector_head.py b/dashboard/modules/stats_collector/stats_collector_head.py index d8c085c0ea62..ae75864e50ca 100644 --- a/dashboard/modules/stats_collector/stats_collector_head.py +++ b/dashboard/modules/stats_collector/stats_collector_head.py @@ -71,8 +71,7 @@ async def _update_stubs(self, change): node_id, node_info = change.new address = "{}:{}".format(node_info["nodeManagerAddress"], int(node_info["nodeManagerPort"])) - options = (("grpc.enable_http_proxy", 0), ) - channel = aiogrpc.insecure_channel(address, options=options) + channel = aiogrpc.insecure_channel(address) stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel) self._stubs[node_id] = stub @@ -203,10 +202,8 @@ def _process_actor_table_data(data): node_id = actor_table_data["address"]["rayletId"] job_actors.setdefault(job_id, {})[actor_id] = actor_table_data - # Update only when node_id is not Nil. 
- if node_id != stats_collector_consts.NIL_NODE_ID: - node_actors.setdefault( - node_id, {})[actor_id] = actor_table_data + node_actors.setdefault(node_id, + {})[actor_id] = actor_table_data DataSource.job_actors.reset(job_actors) DataSource.node_actors.reset(node_actors) logger.info("Received %d actor info from GCS.", @@ -221,35 +218,24 @@ def _process_actor_table_data(data): RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS) # Receive actors from channel. - state_keys = ("state", "address", "numRestarts", "timestamp", "pid") async for sender, msg in receiver.iter(): try: - actor_id, actor_table_data = msg + _, actor_table_data = msg pubsub_message = ray.gcs_utils.PubSubMessage.FromString( actor_table_data) message = ray.gcs_utils.ActorTableData.FromString( pubsub_message.data) actor_table_data = actor_table_data_to_dict(message) _process_actor_table_data(actor_table_data) - # If actor is not new registered but updated, we only update - # states related fields. - if actor_table_data["state"] != "DEPENDENCIES_UNREADY": - actor_id = actor_id.decode("UTF-8")[len( - ray.gcs_utils.TablePrefix_ACTOR_string + ":"):] - actor_table_data_copy = dict(DataSource.actors[actor_id]) - for k in state_keys: - actor_table_data_copy[k] = actor_table_data[k] - actor_table_data = actor_table_data_copy actor_id = actor_table_data["actorId"] job_id = actor_table_data["jobId"] node_id = actor_table_data["address"]["rayletId"] # Update actors. DataSource.actors[actor_id] = actor_table_data - # Update node actors (only when node_id is not Nil). - if node_id != stats_collector_consts.NIL_NODE_ID: - node_actors = dict(DataSource.node_actors.get(node_id, {})) - node_actors[actor_id] = actor_table_data - DataSource.node_actors[node_id] = node_actors + # Update node actors. + node_actors = dict(DataSource.node_actors.get(node_id, {})) + node_actors[actor_id] = actor_table_data + DataSource.node_actors[node_id] = node_actors # Update job actors. 
job_actors = dict(DataSource.job_actors.get(job_id, {})) job_actors[actor_id] = actor_table_data diff --git a/dashboard/modules/stats_collector/tests/test_stats_collector.py b/dashboard/modules/stats_collector/tests/test_stats_collector.py index cb4a1d3c5470..bed6d650fc29 100644 --- a/dashboard/modules/stats_collector/tests/test_stats_collector.py +++ b/dashboard/modules/stats_collector/tests/test_stats_collector.py @@ -7,12 +7,7 @@ import random import pytest import ray -import redis import threading -import ray.new_dashboard.modules.stats_collector.stats_collector_consts \ - as stats_collector_consts -import ray.new_dashboard.utils as dashboard_utils -import ray.ray_constants as ray_constants from datetime import datetime, timedelta from ray.cluster_utils import Cluster from ray.new_dashboard.tests.conftest import * # noqa @@ -378,127 +373,5 @@ def check_errs(): check_errs, (AssertionError), timeout_ms=1000) -def test_nil_node(enable_test_module, disable_aiohttp_cache, - ray_start_with_dashboard): - assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) - is True) - webui_url = ray_start_with_dashboard["webui_url"] - assert wait_until_server_available(webui_url) - webui_url = format_web_url(webui_url) - - @ray.remote(num_gpus=1) - class InfeasibleActor: - pass - - infeasible_actor = InfeasibleActor.remote() # noqa - - timeout_seconds = 5 - start_time = time.time() - last_ex = None - while True: - time.sleep(1) - try: - resp = requests.get(f"{webui_url}/logical/actors") - resp_json = resp.json() - resp_data = resp_json["data"] - actors = resp_data["actors"] - assert len(actors) == 1 - response = requests.get(webui_url + "/test/dump?key=node_actors") - response.raise_for_status() - result = response.json() - assert stats_collector_consts.NIL_NODE_ID not in result["data"][ - "nodeActors"] - break - except Exception as ex: - last_ex = ex - finally: - if time.time() > start_time + timeout_seconds: - ex_stack = traceback.format_exception( - 
type(last_ex), last_ex, - last_ex.__traceback__) if last_ex else [] - ex_stack = "".join(ex_stack) - raise Exception(f"Timed out while testing, {ex_stack}") - - -def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard): - timeout = 5 - assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) - is True) - address_info = ray_start_with_dashboard - address = address_info["redis_address"] - address = address.split(":") - assert len(address) == 2 - - client = redis.StrictRedis( - host=address[0], - port=int(address[1]), - password=ray_constants.REDIS_DEFAULT_PASSWORD) - - p = client.pubsub(ignore_subscribe_messages=True) - p.psubscribe(ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN) - - @ray.remote - class DummyActor: - def __init__(self): - pass - - # Create a dummy actor. - a = DummyActor.remote() - - def handle_pub_messages(client, msgs, timeout, expect_num): - start_time = time.time() - while time.time() - start_time < timeout and len(msgs) < expect_num: - msg = client.get_message() - if msg is None: - time.sleep(0.01) - continue - pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"]) - actor_data = ray.gcs_utils.ActorTableData.FromString( - pubsub_msg.data) - msgs.append(actor_data) - - msgs = [] - handle_pub_messages(p, msgs, timeout, 2) - - # Assert we received published actor messages with state - # DEPENDENCIES_UNREADY and ALIVE. - assert len(msgs) == 2 - - # Kill actor. - ray.kill(a) - handle_pub_messages(p, msgs, timeout, 3) - - # Assert we received published actor messages with state DEAD. 
- assert len(msgs) == 3 - - def actor_table_data_to_dict(message): - return dashboard_utils.message_to_dict( - message, { - "actorId", "parentId", "jobId", "workerId", "rayletId", - "actorCreationDummyObjectId", "callerId", "taskId", - "parentTaskId", "sourceActorId", "placementGroupId" - }, - including_default_value_fields=False) - - non_state_keys = ("actorId", "jobId", "taskSpec") - for msg in msgs: - actor_data_dict = actor_table_data_to_dict(msg) - # DEPENDENCIES_UNREADY is 0, which would not be keeped in dict. We - # need check its original value. - if msg.state == 0: - assert len(actor_data_dict) > 5 - for k in non_state_keys: - assert k in actor_data_dict - # For status that is not DEPENDENCIES_UNREADY, only states fields will - # be published. - elif actor_data_dict["state"] in ("ALIVE", "DEAD"): - assert actor_data_dict.keys() == { - "state", "address", "timestamp", "pid" - } - else: - raise Exception("Unknown state: {}".format( - actor_data_dict["state"])) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/dashboard/tests/conftest.py b/dashboard/tests/conftest.py index ec893fbef252..cb49e8bfc94a 100644 --- a/dashboard/tests/conftest.py +++ b/dashboard/tests/conftest.py @@ -1,40 +1,17 @@ -import os -import pytest -from ray.tests.conftest import * # noqa - - -@pytest.fixture -def enable_test_module(): - os.environ["RAY_DASHBOARD_MODULE_TEST"] = "true" - yield - os.environ.pop("RAY_DASHBOARD_MODULE_TEST", None) - - -@pytest.fixture -def disable_aiohttp_cache(): - os.environ["RAY_DASHBOARD_NO_CACHE"] = "true" - yield - os.environ.pop("RAY_DASHBOARD_NO_CACHE", None) - - -@pytest.fixture -def set_http_proxy(): - http_proxy = os.environ.get("http_proxy", None) - https_proxy = os.environ.get("https_proxy", None) - - # set http proxy - os.environ["http_proxy"] = "www.example.com:990" - os.environ["https_proxy"] = "www.example.com:990" - - yield - - # reset http proxy - if http_proxy: - os.environ["http_proxy"] = http_proxy - else: 
- del os.environ["http_proxy"] - - if https_proxy: - os.environ["https_proxy"] = https_proxy - else: - del os.environ["https_proxy"] +import os +import pytest +from ray.tests.conftest import * # noqa + + +@pytest.fixture +def enable_test_module(): + os.environ["RAY_DASHBOARD_MODULE_TEST"] = "true" + yield + os.environ.pop("RAY_DASHBOARD_MODULE_TEST", None) + + +@pytest.fixture +def disable_aiohttp_cache(): + os.environ["RAY_DASHBOARD_NO_CACHE"] = "true" + yield + os.environ.pop("RAY_DASHBOARD_NO_CACHE", None) diff --git a/dashboard/tests/test_dashboard.py b/dashboard/tests/test_dashboard.py index 529e394613d0..1acc94a169fe 100644 --- a/dashboard/tests/test_dashboard.py +++ b/dashboard/tests/test_dashboard.py @@ -571,38 +571,5 @@ def test_immutable_types(): print(d3[1]) -def test_http_proxy(enable_test_module, set_http_proxy, shutdown_only): - address_info = ray.init(num_cpus=1, include_dashboard=True) - assert (wait_until_server_available(address_info["webui_url"]) is True) - - webui_url = address_info["webui_url"] - webui_url = format_web_url(webui_url) - - timeout_seconds = 10 - start_time = time.time() - while True: - time.sleep(1) - try: - response = requests.get( - webui_url + "/test/dump", - proxies={ - "http": None, - "https": None - }) - response.raise_for_status() - try: - response.json() - assert response.ok - except Exception as ex: - logger.info("failed response: %s", response.text) - raise ex - break - except (AssertionError, requests.exceptions.ConnectionError) as e: - logger.info("Retry because of %s", e) - finally: - if time.time() > start_time + timeout_seconds: - raise Exception("Timed out while testing.") - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/dashboard/utils.py b/dashboard/utils.py index 5c347ed32a49..e1379eea8e14 100644 --- a/dashboard/utils.py +++ b/dashboard/utils.py @@ -1,35 +1,34 @@ import abc +import os +import socket +import time import asyncio import collections +import json import datetime 
import functools import importlib import inspect -import json import logging -import os import pkgutil -import socket import traceback -from abc import ABCMeta, abstractmethod from base64 import b64decode -from collections import namedtuple +from abc import ABCMeta, abstractmethod from collections.abc import MutableMapping, Mapping, Sequence +from collections import namedtuple from typing import Any -import aiohttp.signals -import aiohttp.web import aioredis -import time +import aiohttp.web +import ray.new_dashboard.consts as dashboard_consts from aiohttp import hdrs from aiohttp.frozenlist import FrozenList from aiohttp.typedefs import PathLike from aiohttp.web import RouteDef +import aiohttp.signals from google.protobuf.json_format import MessageToDict - -import ray.new_dashboard.consts as dashboard_consts -from ray.ray_constants import env_bool from ray.utils import binary_to_hex +from ray.ray_constants import env_bool try: create_task = asyncio.create_task diff --git a/doc/examples/lm/lm-cluster.yaml b/doc/examples/lm/lm-cluster.yaml index 7ea6641f588d..3590d482aa64 100644 --- a/doc/examples/lm/lm-cluster.yaml +++ b/doc/examples/lm/lm-cluster.yaml @@ -9,6 +9,23 @@ min_workers: 1 # node. This takes precedence over min_workers. max_workers: 2 +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 1 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. 
+# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.48 # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/doc/examples/plot_example-lm.rst b/doc/examples/plot_example-lm.rst index 204f470b3f29..843a7e782310 100644 --- a/doc/examples/plot_example-lm.rst +++ b/doc/examples/plot_example-lm.rst @@ -11,7 +11,7 @@ You can view the `code for this example`_. .. _`code for this example`: https://github.com/ray-project/ray/tree/master/doc/examples/lm -To use Ray cluster launcher on AWS, install boto (``pip install boto3``) and configure your AWS credentials in ``~/.aws/credentials`` as described on the :ref:`Automatic Cluster Setup page `. +To use Ray cluster launcher on AWS, install boto (``pip install boto3``) and configure your AWS credentials in ``~/.aws/credentials`` as described on the :ref:`Automatic Cluster Setup page `. We provide an `example config file `__ (``lm-cluster.yaml``). In the example config file, we use an ``m5.xlarge`` on-demand instance as the head node, and use ``p3.2xlarge`` GPU spot instances as the worker nodes. We set the minimal number of workers to 1 and maximum workers to 2 in the config, which can be modified according to your own demand. diff --git a/python/ray/autoscaler/kubernetes/example_scripts/run_on_head.py b/doc/kubernetes/example.py similarity index 63% rename from python/ray/autoscaler/kubernetes/example_scripts/run_on_head.py rename to doc/kubernetes/example.py index 3def71effcf2..b1ea3e23d901 100644 --- a/python/ray/autoscaler/kubernetes/example_scripts/run_on_head.py +++ b/doc/kubernetes/example.py @@ -1,10 +1,9 @@ from collections import Counter +import os import sys import time import ray -# Run this script on the Ray head node using kubectl exec. - @ray.remote def gethostname(x): @@ -17,9 +16,7 @@ def gethostname(x): def wait_for_nodes(expected): # Wait for all nodes to join the cluster. 
while True: - resources = ray.cluster_resources() - node_keys = [key for key in resources if "node" in key] - num_nodes = sum(resources[node_key] for node_key in node_keys) + num_nodes = len(ray.nodes()) if num_nodes < expected: print("{} nodes have joined so far, waiting for {} more.".format( num_nodes, expected - num_nodes)) @@ -30,7 +27,7 @@ def wait_for_nodes(expected): def main(): - wait_for_nodes(3) + wait_for_nodes(4) # Check that objects can be transferred from each node to each other node. for i in range(10): @@ -46,5 +43,13 @@ def main(): if __name__ == "__main__": - ray.init(address="auto") + # NOTE: If you know you're running this on the head node, you can just + # use "localhost" here. + # redis_host = "localhost" + if ("RAY_HEAD_SERVICE_HOST" not in os.environ + or os.environ["RAY_HEAD_SERVICE_HOST"] == ""): + raise ValueError("RAY_HEAD_SERVICE_HOST environment variable empty." + "Is there a ray cluster running?") + redis_host = os.environ["RAY_HEAD_SERVICE_HOST"] + ray.init(address=redis_host + ":6379") main() diff --git a/doc/kubernetes/ray-cluster.yaml b/doc/kubernetes/ray-cluster.yaml index fe3a04c486e7..70d386ad5b21 100644 --- a/doc/kubernetes/ray-cluster.yaml +++ b/doc/kubernetes/ray-cluster.yaml @@ -6,18 +6,24 @@ metadata: name: ray-head spec: ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - - name: redis - protocol: TCP - port: 6379 - targetPort: 6379 + # Redis ports. + - name: redis-primary + port: 6379 + targetPort: 6379 + - name: redis-shard-0 + port: 6380 + targetPort: 6380 + - name: redis-shard-1 + port: 6381 + targetPort: 6381 + + # Ray internal communication ports. 
+ - name: object-manager + port: 12345 + targetPort: 12345 + - name: node-manager + port: 12346 + targetPort: 12346 selector: component: ray-head --- @@ -56,12 +62,14 @@ spec: image: rayproject/ray:nightly imagePullPolicy: IfNotPresent command: [ "/bin/bash", "-c", "--" ] - args: - - "ray start --head --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --block" + args: + - "ray start --head --node-ip-address=$MY_POD_IP --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --block" ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to @@ -70,6 +78,11 @@ spec: - mountPath: /dev/shm name: dshm env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + # This is used in the ray start command so that Ray can spawn the # correct number of processes. Omitting this may lead to degraded # performance. @@ -111,14 +124,19 @@ spec: imagePullPolicy: IfNotPresent command: ["/bin/bash", "-c", "--"] args: - - "ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS --object-manager-port=12345 --node-manager-port=12346 --block" - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. 
+ - "ray start --node-ip-address=$MY_POD_IP --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY --object-manager-port=12345 --node-manager-port=12346 --block" + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. volumeMounts: - mountPath: /dev/shm name: dshm env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + # This is used in the ray start command so that Ray can spawn the # correct number of processes. Omitting this may lead to degraded # performance. diff --git a/doc/kubernetes/ray-job.yaml b/doc/kubernetes/ray-job.yaml new file mode 100644 index 000000000000..686359e167d8 --- /dev/null +++ b/doc/kubernetes/ray-job.yaml @@ -0,0 +1,32 @@ +# Job to run a Ray program in its own pod. Assumes that a cluster is already +# running (e.g., from './ray-cluster.yaml'). +apiVersion: batch/v1 +kind: Job +metadata: + namespace: ray + generateName: ray-test-job- +spec: + template: + spec: + restartPolicy: Never + containers: + - name: ray-head + image: rayproject/ray:nightly + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c", "--" ] + args: + - "cd ~ && wget https://raw.githubusercontent.com/ray-project/ray/master/doc/kubernetes/example.py && + ray start --node-ip-address=$MY_POD_IP --num-cpus=0 --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY --object-manager-port=12345 --node-manager-port=12346 && + python example.py" + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. 
+ env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + resources: + requests: + cpu: 100m + memory: 512Mi diff --git a/doc/kubernetes/ray-namespace.yaml b/doc/kubernetes/ray-namespace.yaml new file mode 100644 index 000000000000..3f379c3759b0 --- /dev/null +++ b/doc/kubernetes/ray-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: ray diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index a9a34624a629..cb2c358fa1fa 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -25,7 +25,6 @@ sphinx-jsonschema sphinx-tabs sphinx-version-warning sphinx-book-theme -sphinxcontrib.yt starlette tabulate uvicorn diff --git a/doc/source/actors.rst b/doc/source/actors.rst index d82559af86b0..c680b1558b3d 100644 --- a/doc/source/actors.rst +++ b/doc/source/actors.rst @@ -105,7 +105,7 @@ Methods of the actor can be called remotely. counter_actor = Counter.remote() - assert ray.get(counter_actor.increment.remote()) == 1 + assert counter_actor.increment.remote() == 1 @ray.remote class Foo(object): @@ -174,12 +174,14 @@ have these resources (see `configuration instructions * If you specify resource requirements in an actor class's remote decorator, then the actor will acquire those resources for its entire lifetime (if you - do not specify CPU resources, the default is 0), even if it is not executing + do not specify CPU resources, the default is 1), even if it is not executing any methods. The actor will not acquire any additional resources when executing methods. * If you do not specify any resource requirements in the actor class's remote decorator, then by default, the actor will not acquire any resources for its - lifetime. + lifetime, but every time it executes a method, it will need to acquire 1 CPU + resource. + .. tabs:: .. 
code-tab:: python diff --git a/doc/source/async_api.rst b/doc/source/async_api.rst index 644699d8833b..a305c2dd1be3 100644 --- a/doc/source/async_api.rst +++ b/doc/source/async_api.rst @@ -162,28 +162,3 @@ Instead, you can use the ``max_concurrency`` Actor options without any async met Each invocation of the threaded actor will be running in a thread pool. The size of the threadpool is limited by the ``max_concurrency`` value. - -AsyncIO for Remote Tasks ------------------------- - -We don't support asyncio for remote tasks. The following snippet will fail: - -.. code-block:: python - - @ray.remote - async def f(): - pass - -Instead, you can wrap the ``async`` function with a wrapper to run the task synchronously: - -.. code-block:: python - - async def f(): - pass - - @ray.remote - def wrapper(): - import asyncio - asyncio.get_event_loop().run_until_complete(f()) - - \ No newline at end of file diff --git a/doc/source/cluster/autoscaling.rst b/doc/source/cluster/autoscaling.rst new file mode 100644 index 000000000000..e8d8f235d4e5 --- /dev/null +++ b/doc/source/cluster/autoscaling.rst @@ -0,0 +1,167 @@ +.. _ref-autoscaling: + +Cluster Autoscaling +=================== + +.. tip:: Before you continue, be sure to have read :ref:`cluster-cloud`. + +Basics +------ + +The Ray Cluster Launcher will automatically enable a load-based autoscaler. The scheduler will look at the task, actor, and placement group resource demands from the cluster, and tries to add the minimum set of nodes that can fulfill these demands. When nodes are idle for more than a timeout, they will be removed, down to the ``min_workers`` limit. The head node is never removed. + +To avoid launching too many nodes at once, the number of nodes allowed to be pending is limited by the ``upscaling_speed`` setting. 
By default it is set to ``1.0``, which means the cluster can grow in size by at most ``100%`` at any time (e.g., if the cluster currently has 20 nodes, at most 20 pending launches are allowed). This fraction can be set as high as needed, e.g., ``99999`` to allow the cluster to quickly grow to its max size. + +In more detail, the autoscaler implements the following control loop: + + 1. It calculates the number of nodes required to satisfy all currently pending task, actor, and placement group requests. + 2. If the total number of nodes required, divided by the number of current nodes, exceeds ``1 + upscaling_speed``, then the number of nodes launched will be limited by that threshold. + 3. If a node is idle for a timeout (5 minutes by default), it is removed from the cluster. + +The basic autoscaling config settings are as follows: + +.. code-block:: yaml + + # A unique identifier for the head node and workers of this cluster. + cluster_name: default + + # The minimum number of worker nodes to launch in addition to the head + # node. This number should be >= 0. + min_workers: 0 + + # The autoscaler will scale up the cluster faster with higher upscaling speed. + # E.g., if the task requires adding more nodes then autoscaler will gradually + # scale up the cluster in chunks of upscaling_speed*currently_running_nodes. + # This number should be > 0. + upscaling_speed: 1.0 + + # If a node is idle for this many minutes, it will be removed. A node is + # considered idle if there are no tasks or actors running on it. + idle_timeout_minutes: 5 + +Programmatically Scaling a Cluster +---------------------------------- + +From within a Ray program, you can command the autoscaler to scale the cluster up to a desired size with a ``request_resources()`` call. The cluster will immediately attempt to scale to accommodate the requested resources, bypassing normal upscaling speed constraints. + +..
autofunction:: ray.autoscaler.sdk.request_resources + +Manually Adding Nodes without Resources (Unmanaged Nodes) +--------------------------------------------------------- + +In some cases, adding special nodes without any resources (i.e. `num_cpus=0`) may be desirable. Such nodes can be used as a driver which connects to the cluster to launch jobs. + +In order to manually add a node to an autoscaled cluster, the `ray-cluster-name` tag should be set and `ray-node-type` tag should be set to `unmanaged`. + +Unmanaged nodes **must have 0 resources**. + +If you are using the `available_node_types` field, you should create a custom node type with `resources: {}`, and `max_workers: 0` when configuring the autoscaler. + +The autoscaler will not attempt to start, stop, or update unmanaged nodes. The user is responsible for properly setting up and cleaning up unmanaged nodes. + + +Multiple Node Type Autoscaling +------------------------------ + +Ray supports multiple node types in a single cluster. In this mode of operation, the scheduler will choose the types of nodes to add based on the resource demands, instead of always adding the same kind of node type. + +The concept of a cluster node type encompasses both the physical instance type (e.g., AWS p3.8xl GPU nodes vs m4.16xl CPU nodes), as well as other attributes (e.g., IAM role, the machine image, etc). `Custom resources `__ can be specified for each node type so that Ray is aware of the demand for specific node types at the application level (e.g., a task may request to be placed on a machine with a specific role or machine image via custom resource). + +An example of configuring multiple node types is as follows `(full example) `__: + +.. code-block:: yaml + + # Specify the allowed node types and the resources they provide. + # The key is the name of the node type, which is just for debugging purposes. + # The node config specifies the launch config and physical instance type. 
+ available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m4.xlarge + # For AWS instances, autoscaler will automatically add the available + # CPUs/GPUs/accelerator_type ({"CPU": 4} for m4.xlarge) in "resources". + # resources: {"CPU": 4} + min_workers: 1 + max_workers: 5 + cpu_16_spot: + node_config: + InstanceType: m4.4xlarge + InstanceMarketOptions: + MarketType: spot + # Autoscaler will auto fill the CPU resources below. + resources: {"Custom1": 1, "is_spot": 1} + max_workers: 10 + gpu_1_ondemand: + node_config: + InstanceType: p2.xlarge + # Autoscaler will auto fill the CPU/GPU resources below. + resources: {"Custom2": 2} + max_workers: 4 + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + gpu_8_ondemand: + node_config: + InstanceType: p3.8xlarge + # Autoscaler autofills the "resources" below. + # resources: {"CPU": 32, "GPU": 4, "accelerator_type:V100": 1} + max_workers: 2 + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + + # Specify the node type of the head node (as configured above). + head_node_type: cpu_4_ondemand + + # Specify the default type of the worker node (as configured above). + worker_default_node_type: cpu_16_spot + + +The above config defines two CPU node types (``cpu_4_ondemand`` and ``cpu_16_spot``), and two GPU types (``gpu_1_ondemand`` and ``gpu_8_ondemand``). Each node type has a name (e.g., ``cpu_4_ondemand``), which has no semantic meaning and is only for debugging. Let's look at the inner fields of the ``gpu_1_ondemand`` node type: + +The node config tells the underlying Cloud provider how to launch a node of this type. This node config is merged with the top level node config of the YAML and can override fields (i.e., to specify the p2.xlarge instance type here): + +.. code-block:: yaml + + node_config: + InstanceType: p2.xlarge + +The resources field tells the autoscaler what kinds of resources this node provides. 
This can include custom resources as well (e.g., "Custom2"). This field enables the autoscaler to automatically select the right kind of nodes to launch given the resource demands of the application. The resources specified here will be automatically passed to the ``ray start`` command for the node via an environment variable. For more information, see also the `resource demand scheduler `__: + +.. code-block:: yaml + + resources: {"CPU": 4, "GPU": 1, "Custom2": 2} + +The ``min_workers`` and ``max_workers`` fields constrain the minimum and maximum number of nodes of this type to launch, respectively: + +.. code-block:: yaml + + min_workers: 1 + max_workers: 4 + +The ``worker_setup_commands`` field (and also the ``initialization_commands`` field, not shown) can be used to override the setup and initialization commands for a node type. Note that you can only override the setup for worker nodes. The head node's setup commands are always configured via the top level field in the cluster YAML: + +.. code-block:: yaml + + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + +Docker Support for Multi-type clusters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For each node type, you can specify ``worker_image`` and ``pull_before_run`` fields. These will override any top level ``docker`` section values (see :ref:`autoscaler-docker`). The ``worker_run_options`` field is combined with top level ``docker: run_options`` field to produce the docker run command for the given node_type. Ray will automatically select the Nvidia docker runtime if it is available. + +The following configuration is for a GPU enabled node type: + +.. code-block:: yaml + + available_node_types: + gpu_1_ondemand: + max_workers: 2 + worker_setup_commands: + - pip install tensorflow-gpu # Example command. + + # Docker specific commands for gpu_1_ondemand + pull_before_run: True + worker_image: + - rayproject/ray-ml:latest-gpu + worker_run_options: # Appended to top-level docker field. 
+ - "-v /home:/home" diff --git a/doc/source/cluster/cloud.rst b/doc/source/cluster/cloud.rst index d2e7b90d55eb..ea59f95eaa79 100644 --- a/doc/source/cluster/cloud.rst +++ b/doc/source/cluster/cloud.rst @@ -272,116 +272,6 @@ There are two ways of running private clusters: $ ray down ray/python/ray/autoscaler/local/example-full.yaml -.. _manual-cluster: - -Manual Ray Cluster Setup ------------------------- - -The most preferable way to run a Ray cluster is via the Ray Cluster Launcher. However, it is also possible to start a Ray cluster by hand. - -This section assumes that you have a list of machines and that the nodes in the cluster can communicate with each other. It also assumes that Ray is installed -on each machine. To install Ray, follow the `installation instructions`_. - -.. _`installation instructions`: http://docs.ray.io/en/master/installation.html - -Starting Ray on each machine -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -On the head node (just choose some node to be the head node), run the following. -If the ``--port`` argument is omitted, Ray will choose port 6379, falling back to a -random port. - -.. code-block:: bash - - $ ray start --head --port=6379 - ... - Next steps - To connect to this Ray runtime from another node, run - ray start --address=':6379' --redis-password='' - - If connection fails, check your firewall settings and network configuration. - -The command will print out the address of the Redis server that was started -(the local node IP address plus the port number you specified). - -**Then on each of the other nodes**, run the following. Make sure to replace -``
`` with the value printed by the command on the head node (it -should look something like ``123.45.67.89:6379``). - -Note that if your compute nodes are on their own subnetwork with Network -Address Translation, to connect from a regular machine outside that subnetwork, -the command printed by the head node will not work. You need to find the -address that will reach the head node from the second machine. If the head node -has a domain address like compute04.berkeley.edu, you can simply use that in -place of an IP address and rely on the DNS. - -.. code-block:: bash - - $ ray start --address=
--redis-password='' - -------------------- - Ray runtime started. - -------------------- - - To terminate the Ray runtime, run - ray stop - -If you wish to specify that a machine has 10 CPUs and 1 GPU, you can do this -with the flags ``--num-cpus=10`` and ``--num-gpus=1``. See the :ref:`Configuration ` page for more information. - -If you see ``Unable to connect to Redis. If the Redis instance is on a -different machine, check that your firewall is configured properly.``, -this means the ``--port`` is inaccessible at the given IP address (because, for -example, the head node is not actually running Ray, or you have the wrong IP -address). - -If you see ``Ray runtime started.``, then the node successfully connected to -the IP address at the ``--port``. You should now be able to connect to the -cluster with ``ray.init(address='auto')``. - -If ``ray.init(address='auto')`` keeps repeating -``redis_context.cc:303: Failed to connect to Redis, retrying.``, then the node -is failing to connect to some other port(s) besides the main port. - -.. code-block:: bash - - If connection fails, check your firewall settings and network configuration. - -If the connection fails, to check whether each port can be reached from a node, -you can use a tool such as ``nmap`` or ``nc``. - -.. code-block:: bash - - $ nmap -sV --reason -p $PORT $HEAD_ADDRESS - Nmap scan report for compute04.berkeley.edu (123.456.78.910) - Host is up, received echo-reply ttl 60 (0.00087s latency). - rDNS record for 123.456.78.910: compute04.berkeley.edu - PORT STATE SERVICE REASON VERSION - 6379/tcp open redis syn-ack ttl 60 Redis key-value store - Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . - $ nc -vv -z $HEAD_ADDRESS $PORT - Connection to compute04.berkeley.edu 6379 port [tcp/*] succeeded! - -If the node cannot access that port at that IP address, you might see - -.. 
code-block:: bash - - $ nmap -sV --reason -p $PORT $HEAD_ADDRESS - Nmap scan report for compute04.berkeley.edu (123.456.78.910) - Host is up (0.0011s latency). - rDNS record for 123.456.78.910: compute04.berkeley.edu - PORT STATE SERVICE REASON VERSION - 6379/tcp closed redis reset ttl 60 - Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . - $ nc -vv -z $HEAD_ADDRESS $PORT - nc: connect to compute04.berkeley.edu port 6379 (tcp) failed: Connection refused - - -Stopping Ray -~~~~~~~~~~~~ - -When you want to stop the Ray processes, run ``ray stop`` on each node. - - Additional Cloud Providers -------------------------- @@ -393,62 +283,16 @@ Security On cloud providers, nodes will be launched into their own security group by default, with traffic allowed only between nodes in the same group. A new SSH key will also be created and saved to your local machine for access to the cluster. -.. _using-ray-on-a-cluster: - -Running a Ray program on the Ray cluster ----------------------------------------- - -To run a distributed Ray program, you'll need to execute your program on the same machine as one of the nodes. - -.. tabs:: - .. group-tab:: Python - - Within your program/script, you must call ``ray.init`` and add the ``address`` parameter to ``ray.init`` (like ``ray.init(address=...)``). This causes Ray to connect to the existing cluster. For example: - - .. code-block:: python - - ray.init(address="auto") - - .. group-tab:: Java - - You need to add the ``ray.address`` parameter to your command line (like ``-Dray.address=...``). - - To connect your program to the Ray cluster, run it like this: - - .. code-block:: bash - - java -classpath \ - -Dray.address=
\ - - - .. note:: Specifying ``auto`` as the address hasn't been implemented in Java yet. You need to provide the actual address. You can find the address of the server from the output of the ``ray up`` command. - - -.. note:: A common mistake is setting the address to be a cluster node while running the script on your laptop. This will not work because the script needs to be started/executed on one of the Ray nodes. - -To verify that the correct number of nodes have joined the cluster, you can run the following. - -.. code-block:: python - - import time - - @ray.remote - def f(): - time.sleep(0.01) - return ray.services.get_node_ip_address() - - # Get a list of the IP addresses of the nodes that have joined the cluster. - set(ray.get([f.remote() for _ in range(1000)])) - What's Next? ------------- Now that you have a working understanding of the cluster launcher, check out: -* :ref:`ref-cluster-quick-start`: A end-to-end demo to run an application that autoscales. -* :ref:`cluster-config`: A complete reference of how to configure your Ray cluster. +* :ref:`cluster-config`: A guide to configuring your Ray cluster. * :ref:`cluster-commands`: A short user guide to the various cluster launcher commands. +* A `step by step guide`_ to using the cluster launcher +* :ref:`ref-autoscaling`: An overview of how Ray autoscaling works. diff --git a/doc/source/cluster/config.rst b/doc/source/cluster/config.rst index 430d5473de0c..8260e8f6b7e6 100644 --- a/doc/source/cluster/config.rst +++ b/doc/source/cluster/config.rst @@ -1,286 +1,82 @@ .. _cluster-config: -Cluster YAML Configuration Options -================================== +Configuring your Cluster +======================== -The cluster configuration is defined within a YAML file that will be used by the Cluster Launcher to launch the head node, and by the Autoscaler to launch worker nodes. 
Once the cluster configuration is defined, you will need to use the :ref:`Ray CLI ` to perform any operations such as starting and stopping the cluster. +.. tip:: Before you continue, be sure to have read :ref:`cluster-cloud`. -Syntax ------- +To launch a cluster, you must first create a *cluster configuration file*, which specifies some important details about the cluster. -.. parsed-literal:: +Quickstart +---------- - :ref:`cluster_name `: str - :ref:`max_workers `: int - :ref:`upscaling_speed `: float - :ref:`idle_timeout_minutes `: int - :ref:`docker `: - :ref:`docker ` - :ref:`provider `: - :ref:`provider ` - :ref:`auth `: - :ref:`auth ` - :ref:`available_node_types `: - :ref:`node_types ` - :ref:`worker_nodes `: - :ref:`node_config ` - :ref:`head_node_type `: str - :ref:`file_mounts `: - :ref:`file_mounts ` - :ref:`cluster_synced_files `: - - str - :ref:`rsync_exclude `: - - str - :ref:`rsync_filter `: - - str - :ref:`initialization_commands `: - - str - :ref:`setup_commands `: - - str - :ref:`head_setup_commands `: - - str - :ref:`worker_setup_commands `: - - str - :ref:`head_start_ray_commands `: - - str - :ref:`worker_start_ray_commands `: - - str +At a minimum, we need to specify: -Custom types ------------- +* the name of your cluster, +* the number of workers in the cluster +* the cloud provider +* any setup commands that should run on the node upon launch. -.. _cluster-configuration-docker-type: +Here is an example cluster configuration file: -Docker -~~~~~~ - -.. parsed-literal:: - :ref:`image `: str - :ref:`head_image `: str - :ref:`worker_image `: str - :ref:`container_name `: str - :ref:`pull_before_run `: bool - :ref:`run_options `: - - str - :ref:`head_run_options `: - - str - :ref:`worker_run_options `: - - str - :ref:`disable_automatic_runtime_detection `: bool - :ref:`disable_shm_size_detection `: bool - -.. _cluster-configuration-auth-type: - -Auth -~~~~ - -.. tabs:: - .. group-tab:: AWS - - .. 
parsed-literal:: - - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str - - .. group-tab:: Azure - - .. parsed-literal:: - - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str - :ref:`ssh_public_key `: str - - .. group-tab:: GCP - - .. parsed-literal:: - - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str - -.. _cluster-configuration-provider-type: - -Provider -~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - .. parsed-literal:: - - :ref:`type `: str - :ref:`region `: str - :ref:`availability_zone `: str - :ref:`cache_stopped_nodes `: bool - - .. group-tab:: Azure - - .. parsed-literal:: - - :ref:`type `: str - :ref:`location `: str - :ref:`resource_group `: str - :ref:`subscription_id `: str - :ref:`cache_stopped_nodes `: bool - - .. group-tab:: GCP - - .. parsed-literal:: - - :ref:`type `: str - :ref:`region `: str - :ref:`availability_zone `: str - :ref:`project_id `: str - :ref:`cache_stopped_nodes `: bool - -.. _cluster-configuration-node-types-type: - -Node types -~~~~~~~~~~ - -The nodes types object's keys represent the names of the different node types. - -.. parsed-literal:: - : - :ref:`node_config `: - :ref:`Node config ` - :ref:`resources `: - :ref:`Resources ` - :ref:`min_workers `: int - :ref:`max_workers `: int - :ref:`worker_setup_commands `: - - str - :ref:`docker `: - :ref:`Node Docker ` - : - ... - ... - -.. _cluster-configuration-node-config-type: - -Node config -~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - A YAML object as defined in `the AWS docs `_. - - .. group-tab:: Azure - - A YAML object as defined in `the deployment template `_ whose resources are defined in `the Azure docs `_. - - .. group-tab:: GCP - - A YAML object as defined in `the GCP docs `_. - -.. _cluster-configuration-node-docker-type: - -Node Docker -~~~~~~~~~~~ - -.. parsed-literal:: - - :ref:`image `: str - :ref:`pull_before_run `: bool - :ref:`run_options `: - - str - :ref:`disable_automatic_runtime_detection `: bool - :ref:`disable_shm_size_detection `: bool - -.. 
_cluster-configuration-resources-type: - -Resources -~~~~~~~~~ - -.. parsed-literal:: - - :ref:`CPU `: int - :ref:`GPU `: int - : int - : int - ... - -.. _cluster-configuration-file-mounts-type: - -File mounts -~~~~~~~~~~~ - -.. parsed-literal:: - : str # Path 1 on local machine - : str # Path 2 on local machine - ... - -Properties and Definitions --------------------------- - -.. _cluster-configuration-cluster-name: +.. code-block:: yaml -``cluster_name`` -~~~~~~~~~~~~~~~~ + # A unique identifier for this cluster. + cluster_name: basic-ray -The name of the cluster. This is the namespace of the cluster. + # The maximum number of worker nodes to launch in addition to the head + # node. + max_workers: 0 # this means zero workers -* **Required:** Yes -* **Importance:** High -* **Type:** String -* **Default:** "default" -* **Pattern:** ``[a-zA-Z0-9_]+`` + # Cloud-provider specific configuration. + provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a -.. _cluster-configuration-max-workers: + # How Ray will authenticate with newly launched nodes. + auth: + ssh_user: ubuntu -``max_workers`` -~~~~~~~~~~~~~~~ + setup_commands: + - pip install ray[all] + # The following line demonstrates that you can specify arbitrary + # startup scripts on the cluster. + - touch /tmp/some_file.txt -The maximum number of workers the cluster will have at any given time. +Most of the example YAML file is optional. Here is a `reference minimal YAML file `__, and you can find the defaults for `optional fields in this YAML file `__. -* **Required:** No -* **Importance:** High -* **Type:** Integer -* **Default:** ``2`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded +In another example, the `AWS example configuration file `__ will create a small cluster with an m5.large head node (on-demand) configured to autoscale up to two m5.large `spot workers `__. -..
_cluster-configuration-upscaling-speed: +**You are encouraged to copy the example YAML file and modify it to your needs. This may include adding additional setup commands to install libraries or sync local data files.** -``upscaling_speed`` -~~~~~~~~~~~~~~~~~~~ +Setup Commands +-------------- -The number of nodes allowed to be pending as a multiple of the current number of nodes. For example, if set to 1.0, the cluster can grow in size by at most 100% at any time, so if the cluster currently has 20 nodes, at most 20 pending launches are allowed. +.. tip:: After you have customized the nodes, create a new machine image (or docker container) and use that in the config file to reduce setup times. -* **Required:** No -* **Importance:** Medium -* **Type:** Float -* **Default:** ``1.0`` -* **Minimum:** ``0.0`` -* **Maximum:** Unbounded +The setup commands you use should ideally be *idempotent* (i.e., can be run multiple times without changing the result). This allows Ray to safely update nodes after they have been created. -.. _cluster-configuration-idle-timeout-minutes: +You can usually make commands idempotent with small modifications, e.g. ``git clone foo`` can be rewritten as ``test -e foo || git clone foo`` which checks if the repo is already cloned first. -``idle_timeout_minutes`` -~~~~~~~~~~~~~~~~~~~~~~~~ +.. _autoscaler-docker: -The number of minutes that need to pass before an idle worker node is removed by the Autoscaler. +Docker Support +-------------- -* **Required:** No -* **Importance:** Medium -* **Type:** Integer -* **Default:** ``5`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded +The cluster launcher is fully compatible with Docker images. To use Docker, provide an ``image`` and a ``container_name`` in the ``docker`` field of the YAML. -.. _cluster-configuration-docker: +.. code-block:: yaml -``docker`` -~~~~~~~~~~ + docker: + container_name: "ray_container" + image: "rayproject/ray-ml:latest-gpu" -Configure Ray to run in Docker containers.
+We provide docker images on `DockerHub `__. The ``rayproject/ray-ml:latest`` image is a quick way to get up and running . -* **Required:** No -* **Importance:** High -* **Type:** :ref:`Docker ` -* **Default:** ``{}`` +When the cluster is launched, all of the Ray tasks will be executed completely inside of the container. For GPU support, Ray will automatically select the Nvidia docker runtime if available, and you just need to specify a docker image with the CUDA support (``rayproject/ray-ml:latest-gpu`` and all of our ``-gpu`` images have this). -In rare cases when Docker is not available on the system by default (e.g., bad AMI), add the following commands to :ref:`initialization_commands ` to install it. +If Docker is not installed, add the following commands to ``initialization_commands`` to install it. .. code-block:: yaml @@ -290,813 +86,59 @@ In rare cases when Docker is not available on the system by default (e.g., bad A - sudo usermod -aG docker $USER - sudo systemctl restart docker -f -.. _cluster-configuration-provider: - -``provider`` -~~~~~~~~~~~~ - -The cloud provider-specific configuration properties. - -* **Required:** Yes -* **Importance:** High -* **Type:** :ref:`Provider ` - -.. _cluster-configuration-auth: - -``auth`` -~~~~~~~~ - -Authentication credentials that Ray will use to launch nodes. - -* **Required:** Yes -* **Importance:** High -* **Type:** :ref:`Auth ` - -.. _cluster-configuration-available-node-types: - -``available_node_types`` -~~~~~~~~~~~~~~~~~~~~~~~~ - -Tells the autoscaler the allowed node types and the resources they provide. -The key is the name of the node type, which is just for debugging purposes. - -* **Required:** No -* **Importance:** High -* **Type:** :ref:`Node types ` -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. 
code-block:: yaml - - available_node_types: - ray.head.default: - node_config: - InstanceType: m5.large - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 100 - resources: {"CPU": 2} - min_workers: 0 - max_workers: 0 - ray.worker.small: - node_config: - InstanceType: m5.large - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 2} - min_workers: 0 - max_workers: 1 - -.. _cluster-configuration-head-node-type: - -``head_node_type`` -~~~~~~~~~~~~~~~~~~ - -The key for one of the node types in :ref:`available_node_types `. This node type will be used to launch the head node. - - -* **Required:** Yes -* **Importance:** High -* **Type:** String -* **Pattern:** ``[a-zA-Z0-9_]+`` - -.. _cluster-configuration-worker-nodes: - -``worker_nodes`` -~~~~~~~~~~~~~~~~ - -The configuration to be used to launch worker nodes on the cloud service provider. Generally, node configs are set in the :ref:`node config of each node type `. Setting this property allows propagation of a default value to all the node types when they launch as workers (e.g., using spot instances across all workers can be configured here so that it doesn't have to be set across all instance types). - -* **Required:** No -* **Importance:** Low -* **Type:** :ref:`Node config ` -* **Default:** ``{}`` - -.. _cluster-configuration-file-mounts: - -``file_mounts`` -~~~~~~~~~~~~~~~ - -The files or directories to copy to the head and worker nodes. - -* **Required:** No -* **Importance:** High -* **Type:** :ref:`File mounts ` -* **Default:** ``[]`` - -.. _cluster-configuration-cluster-synced-files: - -``cluster_synced_files`` -~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of paths to the files or directories to copy from the head node to the worker nodes. The same path on the head node will be copied to the worker node. This behavior is a subset of the file_mounts behavior, so in the vast majority of cases one should just use :ref:`file_mounts `. 
- -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-rsync-exclude: - -``rsync_exclude`` -~~~~~~~~~~~~~~~~~ - -A list of patterns for files to exclude when running ``rsync up`` or ``rsync down``. The filter is applied on the source directory only. - -Example for a pattern in the list: ``**/.git/**``. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-rsync-filter: - -``rsync_filter`` -~~~~~~~~~~~~~~~~ - -A list of patterns for files to exclude when running ``rsync up`` or ``rsync down``. The filter is applied on the source directory and recursively through all subdirectories. - -Example for a pattern in the list: ``.gitignore``. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-initialization-commands: - -``initialization_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands that will be run before the :ref:`setup commands `. If Docker is enabled, these commands will run outside the container and before Docker is setup. - -* **Required:** No -* **Importance:** Medium -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-setup-commands: - -``setup_commands`` -~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up nodes. These commands will always run on the head and worker nodes and will be merged with :ref:`head setup commands ` for head and with :ref:`worker setup commands ` for workers. - -* **Required:** No -* **Importance:** Medium -* **Type:** List of String -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. 
code-block:: yaml - - # Default setup_commands: - setup_commands: - - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl - -- Setup commands should ideally be *idempotent* (i.e., can be run multiple times without changing the result); this allows Ray to safely update nodes after they have been created. You can usually make commands idempotent with small modifications, e.g. ``git clone foo`` can be rewritten as ``test -e foo || git clone foo`` which checks if the repo is already cloned first. - -- Setup commands are run sequentially but separately. For example, if you are using anaconda, you need to run ``conda activate env && pip install -U ray`` because splitting the command into two setup commands will not work. - -- Ideally, you should avoid using setup_commands by creating a docker image with all the dependencies preinstalled to minimize startup time. - -- **Tip**: if you also want to run apt-get commands during setup add the following list of commands: - - .. code-block:: yaml - - setup_commands: - - sudo pkill -9 apt-get || true - - sudo pkill -9 dpkg || true - - sudo dpkg --configure -a - -.. _cluster-configuration-head-setup-commands: - -``head_setup_commands`` -~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up the head node. These commands will be merged with the general :ref:`setup commands `. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-worker-setup-commands: - -``worker_setup_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up the worker nodes. These commands will be merged with the general :ref:`setup commands `. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. 
_cluster-configuration-head-start-ray-commands: - -``head_start_ray_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Commands to start ray on the head node. You don't need to change this. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: yaml - - head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml - -.. _cluster-configuration-worker-start-ray-commands: - -``worker_start_ray_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Command to start ray on worker nodes. You don't need to change this. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: yaml - - worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - -.. _cluster-configuration-image: - -``docker.image`` -~~~~~~~~~~~~~~~~ - -The default Docker image to pull in the head and worker nodes. This can be overridden by the :ref:`head_image ` and :ref:`worker_image ` fields. If neither `image` nor (:ref:`head_image ` and :ref:`worker_image `) are specified, Ray will not use Docker. - -* **Required:** Yes (If Docker is in use.) -* **Importance:** High -* **Type:** String - -The Ray project provides Docker images on `DockerHub `_. The repository includes following images: - -* ``rayproject/ray-ml:latest-gpu``: CUDA support, includes ML dependencies. -* ``rayproject/ray:latest-gpu``: CUDA support, no ML dependencies. -* ``rayproject/ray-ml:latest``: No CUDA support, includes ML dependencies. -* ``rayproject/ray:latest``: No CUDA support, no ML dependencies. - -.. _cluster-configuration-head-image: - -``docker.head_image`` -~~~~~~~~~~~~~~~~~~~~~ -Docker image for the head node to override the default :ref:`docker image `. 
- -* **Required:** No -* **Importance:** Low -* **Type:** String - -.. _cluster-configuration-worker-image: - -``docker.worker_image`` -~~~~~~~~~~~~~~~~~~~~~~~ -Docker image for the worker nodes to override the default :ref:`docker image `. - -* **Required:** No -* **Importance:** Low -* **Type:** String - -.. _cluster-configuration-container-name: - -``docker.container_name`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The name to use when starting the Docker container. - -* **Required:** Yes (If Docker is in use.) -* **Importance:** Low -* **Type:** String -* **Default:** ray_container - -.. _cluster-configuration-pull-before-run: - -``docker.pull_before_run`` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, the latest version of image will be pulled when starting Docker. If disabled, ``docker run`` will only pull the image if no cached version is present. - -* **Required:** No -* **Importance:** Medium -* **Type:** Boolean -* **Default:** ``True`` - -.. _cluster-configuration-run-options: - -``docker.run_options`` -~~~~~~~~~~~~~~~~~~~~~~ - -The extra options to pass to ``docker run``. - -* **Required:** No -* **Importance:** Medium -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-head-run-options: - -``docker.head_run_options`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The extra options to pass to ``docker run`` for head node only. - -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` +Common cluster configurations +----------------------------- -.. _cluster-configuration-worker-run-options: +The `example-full.yaml `__ configuration is enough to get started with Ray, but for more compute intensive workloads you will want to change the instance types to e.g. use GPU or larger compute instance by editing the yaml file. 
-``docker.worker_run_options`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Here are a few common configurations (note that we use AWS in the examples, but these examples are generic): -The extra options to pass to ``docker run`` for worker nodes only. +**GPU single node**: use Ray on a single large GPU instance. -* **Required:** No -* **Importance:** Low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-disable-automatic-runtime-detection: - -``docker.disable_automatic_runtime_detection`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, Ray will not try to use the NVIDIA Container Runtime if GPUs are present. - -* **Required:** No -* **Importance:** Low -* **Type:** Boolean -* **Default:** ``False`` - - -.. _cluster-configuration-disable-shm-size-detection: - -``docker.disable_shm_size_detection`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, Ray will not automatically specify the size ``/dev/shm`` for the started container and the runtime's default value (64MiB for Docker) will be used. - -* **Required:** No -* **Importance:** Low -* **Type:** Boolean -* **Default:** ``False`` - - -.. _cluster-configuration-ssh-user: - -``auth.ssh_user`` -~~~~~~~~~~~~~~~~~ - -The user that Ray will authenticate with when launching new nodes. - -* **Required:** Yes -* **Importance:** High -* **Type:** String - -.. _cluster-configuration-ssh-private-key: - -``auth.ssh_private_key`` -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. - - * **Required:** No - * **Importance:** Low - * **Type:** String - - .. group-tab:: Azure - - The path to an existing private key for Ray to use. 
- - * **Required:** Yes - * **Importance:** High - * **Type:** String - - You may use ``ssh-keygen -t rsa -b 4096`` to generate a new ssh keypair. - - .. group-tab:: GCP - - The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. - - * **Required:** No - * **Importance:** Low - * **Type:** String - -.. _cluster-configuration-ssh-public-key: - -``auth.ssh_public_key`` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The path to an existing public key for Ray to use. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-type: - -``provider.type`` -~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The cloud service provider. For AWS, this must be set to ``aws``. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - - .. group-tab:: Azure - - The cloud service provider. For Azure, this must be set to ``azure``. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - - .. group-tab:: GCP - - The cloud service provider. For GCP, this must be set to ``gcp``. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - -.. _cluster-configuration-region: - -``provider.region`` -~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The region to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** us-west-2 - - .. group-tab:: Azure - - Not available. - - .. group-tab:: GCP - - The region to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** us-west1 - -.. 
_cluster-configuration-availability-zone: - -``provider.availability_zone`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. - - * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** us-west-2a,us-west-2b - - .. group-tab:: Azure - - Not available. - - .. group-tab:: GCP - - A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. - - * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** us-west1-a - -.. _cluster-configuration-location: - -``provider.location`` -~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The location to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** westus2 - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-resource-group: - -``provider.resource_group`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The resource group to use for deployment of the Ray cluster. - - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** ray-cluster - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-subscription-id: - -``provider.subscription_id`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. group-tab:: Azure - - The subscription ID to use for deployment of the Ray cluster. If not specified, Ray will use the default from the Azure CLI. - - * **Required:** No - * **Importance:** High - * **Type:** String - * **Default:** ``""`` - - .. group-tab:: GCP - - Not available. - -.. _cluster-configuration-project-id: - -``provider.project_id`` -~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - Not available. - - .. 
group-tab:: Azure - - Not available. - - .. group-tab:: GCP - - The globally unique project ID to use for deployment of the Ray cluster. - - * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** ``null`` - -.. _cluster-configuration-cache-stopped-nodes: - -``provider.cache_stopped_nodes`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If enabled, nodes will be *stopped* when the cluster scales down. If disabled, nodes will be *terminated* instead. Stopped nodes launch faster than terminated nodes. - - -* **Required:** No -* **Importance:** Low -* **Type:** Boolean -* **Default:** ``True`` - -.. _cluster-configuration-node-config: - -``available_node_types..node_type.node_config`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The configuration to be used to launch the nodes on the cloud service provider. Among other things, this will specify the instance type to be launched. - -* **Required:** Yes -* **Importance:** High -* **Type:** :ref:`Node config ` - -.. _cluster-configuration-resources: - -``available_node_types..node_type.resources`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The resources that a node type provides, which enables the autoscaler to automatically select the right type of nodes to launch given the resource demands of the application. The resources specified will be automatically passed to the ``ray start`` command for the node via an environment variable. If not provided, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. For more information, see also the `resource demand scheduler `_ - -* **Required:** Yes (except for AWS/K8s) -* **Importance:** High -* **Type:** :ref:`Resources ` -* **Default:** ``{}`` - -In some cases, adding special nodes without any resources may be desirable. Such nodes can be used as a driver which connects to the cluster to launch jobs. 
In order to manually add a node to an autoscaled cluster, the *ray-cluster-name* tag should be set and *ray-node-type* tag should be set to unmanaged. Unmanaged nodes can be created by setting the resources to ``{}`` and the :ref:`maximum workers ` to 0. The Autoscaler will not attempt to start, stop, or update unmanaged nodes. The user is responsible for properly setting up and cleaning up unmanaged nodes. - -.. _cluster-configuration-node-min-workers: - -``available_node_types..node_type.min_workers`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The minimum number of workers to maintain for this node type regardless of utilization. - -* **Required:** No -* **Importance:** High -* **Type:** Integer -* **Default:** ``0`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded - -.. _cluster-configuration-node-max-workers: - -``available_node_types..node_type.max_workers`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The maximum number of workers to have in the cluster for this node type regardless of utilization. This takes precedence over :ref:`minimum workers `. - -* **Required:** No -* **Importance:** High -* **Type:** Integer -* **Default:** ``0`` -* **Minimum:** ``0`` -* **Maximum:** Unbounded - -.. _cluster-configuration-node-type-worker-setup-commands: - -``available_node_types..node_type.worker_setup_commands`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A list of commands to run to set up worker nodes of this type. These commands will replace the general :ref:`worker setup commands ` for the node. - -* **Required:** No -* **Importance:** low -* **Type:** List of String -* **Default:** ``[]`` - -.. _cluster-configuration-cpu: - -``available_node_types..node_type.resources.CPU`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The number of CPUs made available by this node. 
If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. - - * **Required:** Yes (except for AWS/K8s) - * **Importance:** High - * **Type:** Integer - - .. group-tab:: Azure - - The number of CPUs made available by this node. - - * **Required:** Yes - * **Importance:** High - * **Type:** Integer - - .. group-tab:: GCP - - The number of CPUs made available by this node. - - * **Required:** No - * **Importance:** High - * **Type:** Integer - - -.. _cluster-configuration-gpu: - -``available_node_types..node_type.resources.GPU`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. tabs:: - .. group-tab:: AWS - - The number of GPUs made available by this node. If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. - - * **Required:** No - * **Importance:** Low - * **Type:** Integer - - .. group-tab:: Azure - - The number of GPUs made available by this node. - - * **Required:** No - * **Importance:** High - * **Type:** Integer - - .. group-tab:: GCP - - The number of GPUs made available by this node. - - * **Required:** No - * **Importance:** High - * **Type:** Integer - -.. _cluster-configuration-node-docker: - -``available_node_types..docker`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A set of overrides to the top-level :ref:`Docker ` configuration. +.. code-block:: yaml -* **Required:** No -* **Importance:** Low -* **Type:** :ref:`docker ` -* **Default:** ``{}`` + max_workers: 0 + head_node: + InstanceType: p2.8xlarge -Examples --------- -Minimal configuration -~~~~~~~~~~~~~~~~~~~~~ +**Mixed GPU and CPU nodes**: for RL applications that require proportionally more +CPU than GPU resources, you can use additional CPU workers with a GPU head node. -.. tabs:: - .. group-tab:: AWS +.. code-block:: yaml - .. 
literalinclude:: ../../../python/ray/autoscaler/aws/example-minimal.yaml - :language: yaml + max_workers: 10 + head_node: + InstanceType: p2.8xlarge + worker_nodes: + InstanceType: m4.16xlarge - .. group-tab:: Azure - - .. literalinclude:: ../../../python/ray/autoscaler/azure/example-minimal.yaml - :language: yaml +**Autoscaling CPU cluster**: use a small head node and have Ray auto-scale +workers as needed. This can be a cost-efficient configuration for clusters with +bursty workloads. You can also request spot workers for additional cost savings. - .. group-tab:: GCP - - .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-minimal.yaml - :language: yaml +.. code-block:: yaml -Full configuration -~~~~~~~~~~~~~~~~~~ + min_workers: 0 + max_workers: 10 + head_node: + InstanceType: m4.large + worker_nodes: + InstanceMarketOptions: + MarketType: spot + InstanceType: m4.16xlarge -.. tabs:: - .. group-tab:: AWS +**Autoscaling GPU cluster**: similar to the autoscaling CPU cluster, but +with GPU worker nodes instead. - .. literalinclude:: ../../../python/ray/autoscaler/aws/example-full.yaml - :language: yaml +.. code-block:: yaml - .. group-tab:: Azure - - .. literalinclude:: ../../../python/ray/autoscaler/azure/example-full.yaml - :language: yaml + min_workers: 0 # NOTE: older Ray versions may need 1+ GPU workers (#2106) + max_workers: 10 + head_node: + InstanceType: m4.large + worker_nodes: + InstanceMarketOptions: + MarketType: spot + InstanceType: p2.xlarge - .. group-tab:: GCP - - .. literalinclude:: ../../../python/ray/autoscaler/gcp/example-full.yaml - :language: yaml diff --git a/doc/source/cluster/deploy.rst b/doc/source/cluster/deploy.rst index 24bcfe456e0d..60a45e171062 100644 --- a/doc/source/cluster/deploy.rst +++ b/doc/source/cluster/deploy.rst @@ -5,7 +5,7 @@ Ray with Cluster Managers .. note:: - If you're using AWS, Azure or GCP you can use the :ref:`Ray Cluster Launcher ` to simplify the cluster setup process. 
+ If you're using AWS, Azure or GCP you can use the :ref:`Ray Cluster Launcher ` to simplify the cluster setup process. .. toctree:: :maxdepth: 2 diff --git a/doc/source/cluster/index.rst b/doc/source/cluster/index.rst index f32fab54874a..c95eca1cb2b7 100644 --- a/doc/source/cluster/index.rst +++ b/doc/source/cluster/index.rst @@ -1,26 +1,229 @@ .. _cluster-index: -Ray Cluster Overview -==================== +Distributed Ray Overview +======================== -What is a Ray cluster? +One of Ray's strengths is the ability to leverage multiple machines in the same program. Ray can, of course, be run on a single machine (and is done so often) but the real power is using Ray on a cluster of machines. + +Key Concepts +------------ + +* **Ray Nodes**: A Ray cluster consists of a **head node** and a set of **worker nodes**. The head node needs to be started first, and the worker nodes are given the address of the head node to form the cluster. The Ray cluster itself can also "auto-scale," meaning that it can interact with a Cloud Provider to request or release instances according to application workload. + +* **Ports**: Ray processes communicate via TCP ports. When starting a Ray cluster, either on prem or on the cloud, it is important to open the right ports so that Ray functions correctly. See :ref:`the Ray Ports documentation ` for more details. + +* **Ray Cluster Launcher**: The :ref:`Ray Cluster Launcher ` is a simple tool that automatically provisions machines and launches a multi-node Ray cluster. You can use the cluster launcher on GCP, Amazon EC2, Azure, or even Kubernetes. + +Summary +------- + +Clusters are started with the :ref:`Ray Cluster Launcher ` or :ref:`manually `. + +You can also create a Ray cluster using a standard cluster manager such as :ref:`Kubernetes `, :ref:`YARN `, or :ref:`SLURM `. 
+ +After a cluster is started, you need to connect your program to the Ray cluster by starting a driver process on the same node where you ran ``ray start``: + +.. tabs:: + .. code-tab:: python + + # This must be run on a node where `ray start` was executed. + import ray + ray.init(address='auto') + + .. group-tab:: java + + .. code-block:: java + + import io.ray.api.Ray; + + public class MyRayApp { + + public static void main(String[] args) { + Ray.init(); + ... + } + } + + .. code-block:: bash + + java -classpath <classpath> \ + -Dray.address=<address>
\ + <classname> <args> + +and then the rest of your script should be able to leverage Ray as a distributed framework! + + +Using the cluster launcher +-------------------------- + +The ``ray up`` command uses the :ref:`Ray Cluster Launcher ` to start a cluster on the cloud, creating a designated "head node" and worker nodes. Any Python process that runs ``ray.init(address=...)`` on any of the cluster nodes will connect to the Ray cluster. + +.. important:: Calling ``ray.init`` on your laptop will not work if using ``ray up``, since your laptop will not be the head node. + +Here is an example of using the Cluster Launcher on AWS: + +.. code-block:: shell + + # First, run `pip install boto3` and `aws configure` + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/aws/example-full.yaml + +You can monitor the Ray cluster status with ``ray monitor cluster.yaml`` and ssh into the head node with ``ray attach cluster.yaml``. + +.. _manual-cluster: + +Manual Ray Cluster Setup ------------------------ -One of Ray's strengths is the ability to leverage multiple machines in the same program. Ray can, of course, be run on a single machine (and is done so often), but the real power is using Ray on a cluster of machines. +The preferred way to run a Ray cluster is via the :ref:`Ray Cluster Launcher `. However, it is also possible to start a Ray cluster by hand. + +This section assumes that you have a list of machines and that the nodes in the cluster can communicate with each other. It also assumes that Ray is installed +on each machine. To install Ray, follow the `installation instructions`_. + +.. _`installation instructions`: http://docs.ray.io/en/master/installation.html + +Starting Ray on each machine +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On the head node (just choose some node to be the head node), run the following. 
+If the ``--port`` argument is omitted, Ray will choose port 6379, falling back to a +random port. + +.. code-block:: bash + + $ ray start --head --port=6379 + ... + Next steps + To connect to this Ray runtime from another node, run + ray start --address='<ip address>:6379' --redis-password='<password>' + + If connection fails, check your firewall settings and network configuration. + +The command will print out the address of the Redis server that was started +(the local node IP address plus the port number you specified). + +**Then on each of the other nodes**, run the following. Make sure to replace +``<address>
`` with the value printed by the command on the head node (it +should look something like ``123.45.67.89:6379``). + +Note that if your compute nodes are on their own subnetwork with Network +Address Translation, to connect from a regular machine outside that subnetwork, +the command printed by the head node will not work. You need to find the +address that will reach the head node from the second machine. If the head node +has a domain address like compute04.berkeley.edu, you can simply use that in +place of an IP address and rely on DNS. + +.. code-block:: bash + + $ ray start --address=<address>
--redis-password='' + -------------------- + Ray runtime started. + -------------------- + + To terminate the Ray runtime, run + ray stop + +If you wish to specify that a machine has 10 CPUs and 1 GPU, you can do this +with the flags ``--num-cpus=10`` and ``--num-gpus=1``. See the :ref:`Configuration ` page for more information. + +If you see ``Unable to connect to Redis. If the Redis instance is on a +different machine, check that your firewall is configured properly.``, +this means the ``--port`` is inaccessible at the given IP address (because, for +example, the head node is not actually running Ray, or you have the wrong IP +address). + +If you see ``Ray runtime started.``, then the node successfully connected to +the IP address at the ``--port``. You should now be able to connect to the +cluster with ``ray.init(address='auto')``. + +If ``ray.init(address='auto')`` keeps repeating +``redis_context.cc:303: Failed to connect to Redis, retrying.``, then the node +is failing to connect to some other port(s) besides the main port. + +.. code-block:: bash + + If connection fails, check your firewall settings and network configuration. + +If the connection fails, to check whether each port can be reached from a node, +you can use a tool such as ``nmap`` or ``nc``. + +.. code-block:: bash + + $ nmap -sV --reason -p $PORT $HEAD_ADDRESS + Nmap scan report for compute04.berkeley.edu (123.456.78.910) + Host is up, received echo-reply ttl 60 (0.00087s latency). + rDNS record for 123.456.78.910: compute04.berkeley.edu + PORT STATE SERVICE REASON VERSION + 6379/tcp open redis syn-ack ttl 60 Redis key-value store + Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . + $ nc -vv -z $HEAD_ADDRESS $PORT + Connection to compute04.berkeley.edu 6379 port [tcp/*] succeeded! + +If the node cannot access that port at that IP address, you might see + +.. 
code-block:: bash + + $ nmap -sV --reason -p $PORT $HEAD_ADDRESS + Nmap scan report for compute04.berkeley.edu (123.456.78.910) + Host is up (0.0011s latency). + rDNS record for 123.456.78.910: compute04.berkeley.edu + PORT STATE SERVICE REASON VERSION + 6379/tcp closed redis reset ttl 60 + Service detection performed. Please report any incorrect results at https://nmap.org/submit/ . + $ nc -vv -z $HEAD_ADDRESS $PORT + nc: connect to compute04.berkeley.edu port 6379 (tcp) failed: Connection refused + + +Stopping Ray +~~~~~~~~~~~~ + +When you want to stop the Ray processes, run ``ray stop`` on each node. + +.. _using-ray-on-a-cluster: + +Running a Ray program on the Ray cluster +---------------------------------------- + +To run a distributed Ray program, you'll need to execute your program on the same machine as one of the nodes. + +.. tabs:: + .. group-tab:: Python + + Within your program/script, you must call ``ray.init`` and add the ``address`` parameter to ``ray.init`` (like ``ray.init(address=...)``). This causes Ray to connect to the existing cluster. For example: + + .. code-block:: python + + ray.init(address="auto") + + .. group-tab:: Java + + You need to add the ``ray.address`` parameter to your command line (like ``-Dray.address=...``). + + To connect your program to the Ray cluster, run it like this: + + .. code-block:: bash + + java -classpath \ + -Dray.address=
\ + -A Ray cluster consists of a **head node** and a set of **worker nodes**. The head node needs to be started first, and the worker nodes are given the address of the head node to form the cluster. + .. note:: Specifying ``auto`` as the address hasn't been implemented in Java yet. You need to provide the actual address. You can find the address of the server from the output of the ``ray up`` command. -You can use the Ray Cluster Launcher to provision machines and launch a multi-node Ray cluster. You can use the cluster launcher on AWS, GCP, Azure, Kubernetes, on-premise, and Staroid or even on your custom node provider. Ray clusters can also make use of the Ray Autoscaler, which allows Ray to interact with a cloud provider to request or release instances according to application workload. -How does it work? ------------------ +.. note:: A common mistake is setting the address to be a cluster node while running the script on your laptop. This will not work because the script needs to be started/executed on one of the Ray nodes. -The Ray Cluster Launcher will automatically enable a load-based autoscaler. The autoscaler resource demand scheduler will look at the pending tasks, actors, and placement groups resource demands from the cluster, and try to add the minimum list of nodes that can fulfill these demands. When worker nodes are idle for more than :ref:`idle_timeout_minutes `, they will be removed (the head node is never removed unless the cluster is teared down). +To verify that the correct number of nodes have joined the cluster, you can run the following. -Autoscaler uses a simple binpacking algorithm to binpack the user demands into the available cluster resources. The remaining unfulfilled demands are placed on the smallest list of nodes that satisfies the demand while maximizing utilization (starting from the smallest node). +.. 
code-block:: python -**Here is "A Glimpse into the Ray Autoscaler" and how to debug/monitor your cluster:** + import time -2021-19-01 by Ameer Haj-Ali, Anyscale, Inc. + @ray.remote + def f(): + time.sleep(0.01) + return ray.services.get_node_ip_address() -.. youtube:: BJ06eJasdu4 + # Get a list of the IP addresses of the nodes that have joined the cluster. + set(ray.get([f.remote() for _ in range(1000)])) diff --git a/doc/source/cluster/k8s-operator.rst b/doc/source/cluster/k8s-operator.rst new file mode 100644 index 000000000000..2fb8efef8974 --- /dev/null +++ b/doc/source/cluster/k8s-operator.rst @@ -0,0 +1,238 @@ +.. _k8s-operator: + +The Ray Kubernetes Operator +================================= + +Ray provides a `Kubernetes Operator`_ for managing autoscaling Ray clusters. +Using the operator provides similar functionality to deploying a Ray cluster using +the :ref:`Ray Cluster Launcher`. However, working with the operator does not require +running Ray locally -- all interactions with your Ray cluster are mediated by Kubernetes. + +The operator makes use of a `Kubernetes Custom Resource`_ called a *RayCluster*. +A RayCluster is specified by a configuration similar to the ``yaml`` files used by the Ray Cluster Launcher. +Internally, the operator uses Ray's autoscaler to manage your Ray cluster. However, the autoscaler runs in a +separate operator pod, rather than on the Ray head node. Applying multiple RayCluster custom resources in the operator's +namespace allows the operator to manage several Ray clusters. + +The rest of this document explains step-by-step how to use the Ray Kubernetes Operator to launch a Ray cluster on your existing Kubernetes cluster. + +.. role:: bash(code) + :language: bash + +.. warning:: + The Ray Kubernetes Operator requires Kubernetes version at least ``v1.17.0``. Check Kubernetes version info with the command + :bash:`kubectl version`. + +.. 
note:: + The example commands in this document launch six Kubernetes pods, using a total of 6 CPU and 3.5Gi memory. + If you are experimenting using a test Kubernetes environment such as `minikube`_, make sure to provision sufficient resources, e.g. + :bash:`minikube start --cpus=6 --memory=\"4G\"`. + Alternatively, reduce resource usage by editing the ``yaml`` files referenced in this document; for example, reduce ``minWorkers`` + in ``example_cluster.yaml`` and ``example_cluster2.yaml``. + + +Applying the RayCluster Custom Resource Definition +-------------------------------------------------- +First, we need to apply the `Kubernetes Custom Resource Definition`_ (CRD) defining a RayCluster. + +.. note:: + + Creating a Custom Resource Definition requires the appropriate Kubernetes cluster-level privileges. + +.. code-block:: shell + + $ kubectl apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml + + customresourcedefinition.apiextensions.k8s.io/rayclusters.cluster.ray.io created + +Picking a Kubernetes Namespace +------------------------------- +The rest of the Kubernetes resources we will use are `namespaced`_. +You can use an existing namespace for your Ray clusters or create a new one if you have permissions. +For this example, we will create a namespace called ``ray``. + +.. code-block:: shell + + $ kubectl create namespace ray + + namespace/ray created + +Starting the Operator +---------------------- + +To launch the operator in our namespace, we execute the following command. + +.. code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml + + serviceaccount/ray-operator-serviceaccount created + role.rbac.authorization.k8s.io/ray-operator-role created + rolebinding.rbac.authorization.k8s.io/ray-operator-rolebinding created + pod/ray-operator-pod created + +The output shows that we've launched a Pod named ``ray-operator-pod``. This is the pod that runs the operator process. 
+The ServiceAccount, Role, and RoleBinding we have created grant the operator pod the `permissions`_ it needs to manage Ray clusters. + +Launching Ray Clusters +---------------------- +Finally, to launch a Ray cluster, we create a RayCluster custom resource. + +.. code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml + + raycluster.cluster.ray.io/example-cluster created + +The operator detects the RayCluster resource we've created and launches an autoscaling Ray cluster. +Our RayCluster configuration specifies ``minWorkers:2`` in the second entry of ``spec.podTypes``, so we get a head node and two workers upon launch. + +.. note:: + + For more details about RayCluster resources, we recommend taking a look at the annotated example ``example_cluster.yaml`` applied in the last command. + +.. code-block:: shell + + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + example-cluster-ray-head-hbxvv 1/1 Running 0 72s + example-cluster-ray-worker-4hvv6 1/1 Running 0 64s + example-cluster-ray-worker-78kp5 1/1 Running 0 64s + ray-operator-pod 1/1 Running 0 2m33s + +We see four pods: the operator, the Ray head node, and two Ray worker nodes. + +Let's launch another cluster in the same namespace, this one specifying ``minWorkers:1``. + +.. code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + +We confirm that both clusters are running in our namespace. + +.. 
code-block:: shell + + $ kubectl -n ray get rayclusters + NAME AGE + example-cluster 12m + example-cluster2 114s + + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + example-cluster-ray-head-th4wv 1/1 Running 0 10m + example-cluster-ray-worker-q9pjn 1/1 Running 0 10m + example-cluster-ray-worker-qltnp 1/1 Running 0 10m + example-cluster2-ray-head-kj5mg 1/1 Running 0 10s + example-cluster2-ray-worker-qsgnd 1/1 Running 0 1s + ray-operator-pod 1/1 Running 0 10m + +Now we can :ref:`run Ray programs` on our Ray clusters. + +Monitoring +---------- +Autoscaling logs are written to the operator pod's ``stdout`` and can be accessed with :code:`kubectl logs`. +Each line of output is prefixed by the name of the cluster followed by a colon. +The following command gets the last hundred lines of autoscaling logs for our second cluster. + +.. code-block:: shell + + $ kubectl -n ray logs ray-operator-pod | grep ^example-cluster2: | tail -n 100 + +The output should include monitoring updates that look like this: + +.. 
code-block:: shell + + example-cluster2:2020-12-12 13:55:36,814 DEBUG autoscaler.py:693 -- Cluster status: 1 nodes + example-cluster2: - MostDelayedHeartbeats: {'172.17.0.4': 0.04093289375305176, '172.17.0.5': 0.04084634780883789} + example-cluster2: - NodeIdleSeconds: Min=36 Mean=38 Max=41 + example-cluster2: - ResourceUsage: 0.0/2.0 CPU, 0.0/1.0 Custom1, 0.0/1.0 is_spot, 0.0 GiB/0.58 GiB memory, 0.0 GiB/0.1 GiB object_store_memory + example-cluster2: - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 + example-cluster2:Worker node types: + example-cluster2: - worker-nodes: 1 + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:148 -- Cluster resources: [{'object_store_memory': 1.0, 'node:172.17.0.4': 1.0, 'memory': 5.0, 'CPU': 1.0}, {'object_store_memory': 1.0, 'is_spot': 1.0, 'memory': 6.0, 'node:172.17.0.5': 1.0, 'Custom1': 1.0, 'CPU': 1.0}] + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:149 -- Node counts: defaultdict(, {'head-node': 1, 'worker-nodes + ': 1}) + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:159 -- Placement group demands: [] + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:186 -- Resource demands: [] + example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:187 -- Unfulfilled demands: [] + example-cluster2:2020-12-12 13:55:36,891 INFO resource_demand_scheduler.py:209 -- Node requests: {} + example-cluster2:2020-12-12 13:55:36,903 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). + example-cluster2:2020-12-12 13:55:36,923 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). + + +Updating and Retrying +--------------------- +To update a Ray cluster's configuration, edit the ``yaml`` file of the corresponding RayCluster resource +and apply it again: + +.. 
code-block:: shell + + $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml + +To force a restart with the same configuration, you can add an `annotation`_ to the RayCluster resource's ``metadata.annotations`` field, e.g. + +.. code-block:: yaml + + apiVersion: cluster.ray.io/v1 + kind: RayCluster + metadata: + name: example-cluster + annotations: + try: again + spec: + ... + +Then reapply the RayCluster, as above. + +Currently, editing and reapplying a RayCluster resource will stop and restart Ray processes running on the corresponding +Ray cluster. Similarly, deleting and relaunching the operator pod will stop and restart Ray processes on all Ray clusters in the operator's namespace. +This behavior may be modified in future releases. + + +Cleaning Up +----------- +We shut down a Ray cluster by deleting the associated RayCluster resource. +Either of the next two commands will delete our second cluster ``example-cluster2``. + +.. code-block:: shell + + $ kubectl -n ray delete raycluster example-cluster2 + # OR + $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + +The pods associated with ``example-cluster2`` go into ``TERMINATING`` status. In a few moments, we check that these pods are gone: + +.. code-block:: shell + + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + example-cluster-ray-head-th4wv 1/1 Running 0 57m + example-cluster-ray-worker-q9pjn 1/1 Running 0 56m + example-cluster-ray-worker-qltnp 1/1 Running 0 56m + ray-operator-pod 1/1 Running 0 57m + +Only the operator pod and the first ``example-cluster`` remain. + +To finish clean-up, we delete the cluster ``example-cluster`` and then the operator's resources. + +.. 
code-block:: shell + + $ kubectl -n ray delete raycluster example-cluster + $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml + +If you like, you can delete the RayCluster custom resource definition. +(Using the operator again will then require reapplying the CRD.) + +.. code-block:: shell + + $ kubectl delete crd rayclusters.cluster.ray.io + # OR + $ kubectl delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml + +.. _`Kubernetes Operator`: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/ +.. _`Kubernetes Custom Resource`: https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/ +.. _`Kubernetes Custom Resource Definition`: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/ +.. _`annotation`: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/#attaching-metadata-to-objects +.. _`permissions`: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ +.. _`minikube`: https://minikube.sigs.k8s.io/docs/start/ +.. _`namespaced`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ diff --git a/doc/source/cluster/kubernetes-gpu.rst b/doc/source/cluster/kubernetes-gpu.rst deleted file mode 100644 index c91382bf6e7a..000000000000 --- a/doc/source/cluster/kubernetes-gpu.rst +++ /dev/null @@ -1,91 +0,0 @@ -:orphan: - -.. _k8s-gpus: - -GPU Usage with Kubernetes -========================= -This document provides some notes on GPU usage with Kubernetes. - -To use GPUs on Kubernetes, you will need to configure both your Kubernetes setup and add additional values to your Ray cluster configuration. - -For relevant documentation for GPU usage on different clouds, see instructions for `GKE`_, for `EKS`_, and for `AKS`_. - -The `Ray Docker Hub `_ hosts CUDA-based images packaged with Ray for use in Kubernetes pods. 
-For example, the image ``rayproject/ray-ml:nightly-gpu`` is ideal for running GPU-based ML workloads with the most recent nightly build of Ray. -Read :ref:`here` for further details on Ray images. - -Using Nvidia GPUs requires specifying the relevant resource `limits` in the container fields of your Kubernetes configurations. -(Kubernetes `sets `_ -the GPU request equal to the limit.) The configuration for a pod running a Ray GPU image and -using one Nvidia GPU looks like this: - -.. code-block:: yaml - - apiVersion: v1 - kind: Pod - metadata: - generateName: example-cluster-ray-worker - spec: - ... - containers: - - name: ray-node - image: rayproject/ray:nightly-gpu - ... - resources: - cpu: 1000m - memory: 512Mi - limits: - memory: 512Mi - nvidia.com/gpu: 1 - -GPU taints and tolerations --------------------------- -.. note:: - - Users using a managed Kubernetes service probably don't need to worry about this section. - -The `Nvidia gpu plugin`_ for Kubernetes applies `taints`_ to GPU nodes; these taints prevent non-GPU pods from being scheduled on GPU nodes. -Managed Kubernetes services like GKE, EKS, and AKS automatically apply matching `tolerations`_ -to pods requesting GPU resources. Tolerations are applied by means of Kubernetes's `ExtendedResourceToleration`_ `admission controller`_. -If this admission controller is not enabled for your Kubernetes cluster, you may need to manually add a GPU toleration each of to your GPU pod configurations. For example, - -.. code-block:: yaml - - apiVersion: v1 - kind: Pod - metadata: - generateName: example-cluster-ray-worker - spec: - ... - tolerations: - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - ... - containers: - - name: ray-node - image: rayproject/ray:nightly-gpu - ... - -Further reference and discussion --------------------------------- -Read about Kubernetes device plugins `here `__, -about Kubernetes GPU plugins `here `__, -and about Nvidia's GPU plugin for Kubernetes `here `__. 
- -If you run into problems setting up GPUs for your Ray cluster on Kubernetes, please reach out to us at ``_. - -Questions or Issues? --------------------- - -.. include:: /_help.rst - -.. _`GKE`: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus -.. _`EKS`: https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html -.. _`AKS`: https://docs.microsoft.com/en-us/azure/aks/gpu-cluster - -.. _`tolerations`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ -.. _`taints`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ -.. _`Nvidia gpu plugin`: https://github.com/NVIDIA/k8s-device-plugin -.. _`admission controller`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/ -.. _`ExtendedResourceToleration`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/#extendedresourcetoleration diff --git a/doc/source/cluster/kubernetes-manual.rst b/doc/source/cluster/kubernetes-manual.rst deleted file mode 100644 index 5cd6e10ffc0a..000000000000 --- a/doc/source/cluster/kubernetes-manual.rst +++ /dev/null @@ -1,162 +0,0 @@ -:orphan: - -.. _ray-k8s-static: - -Deploying a Static Cluster -========================== - -This document gives an example of how to manually deploy a non-autoscaling Ray cluster on Kubernetes. - -To learn about deploying an autoscaling Ray cluster using :ref:`Ray's Kubernetes operator`, read -:ref:`here`. - -To learn about deploying an autoscaling Ray cluster using the :ref:`Ray Cluster Launcher`, read -:ref:`here`. - - -Creating a Ray Namespace ------------------------- - -First, create a `Kubernetes Namespace`_ for Ray resources on your cluster. The -following commands will create resources under this Namespace, so if you want -to use a different one than ``ray``, please be sure to also change the -``namespace`` fields in the provided ``yaml`` files and anytime you see a ``-n`` -flag passed to ``kubectl``. - -.. 
code-block:: shell - - $ kubectl create namespace ray - -Starting a Ray Cluster ----------------------- - - -A Ray cluster consists of a single head node and a set of worker nodes (the -provided ``ray-cluster.yaml`` file will start 3 worker nodes). In the example -Kubernetes configuration, this is implemented as: - -- A ``ray-head`` `Kubernetes Service`_ that enables the worker nodes to discover the location of the head node on start up. - This Service also enables access to the Ray Client and Ray Dashboard. -- A ``ray-head`` `Kubernetes Deployment`_ that backs the ``ray-head`` Service with a single head node pod (replica). -- A ``ray-worker`` `Kubernetes Deployment`_ with multiple worker node pods (replicas) that connect to the ``ray-head`` pod using the ``ray-head`` Service. - -Note that because the head and worker nodes are Deployments, Kubernetes will -automatically restart pods that crash to maintain the correct number of -replicas. - -- If a worker node goes down, a replacement pod will be started and joined to the cluster. -- If the head node goes down, it will be restarted. This will start a new Ray cluster. Worker nodes that were connected to the old head node will crash and be restarted, connecting to the new head node when they come back up. - -Try deploying a cluster with the provided Kubernetes config by running the -following command: - -.. code-block:: shell - - $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml - -Verify that the pods are running by running ``kubectl get pods -n ray``. You -may have to wait up to a few minutes for the pods to enter the 'Running' -state on the first run. - -.. code-block:: shell - - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-6bxvz 1/1 Running 0 10s - ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 5s - ray-worker-5c49b7cc57-d9m86 1/1 Running 0 5s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 5s - -.. note:: - - You might see a nonzero number of RESTARTS for the worker pods. 
That can - happen when the worker pods start up before the head pod and the workers - aren't able to connect. This shouldn't affect the behavior of the cluster. - -To change the number of worker nodes in the cluster, change the ``replicas`` -field in the worker deployment configuration in that file and then re-apply -the config as follows: - -.. code-block:: shell - - # Edit 'ray/doc/kubernetes/ray-cluster.yaml' and change the 'replicas' - # field under the ray-worker deployment to, e.g., 4. - - # Re-apply the new configuration to the running deployment. - $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml - service/ray-head unchanged - deployment.apps/ray-head unchanged - deployment.apps/ray-worker configured - - # Verify that there are now the correct number of worker pods running. - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-6bxvz 1/1 Running 0 30s - ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 25s - ray-worker-5c49b7cc57-d9m86 1/1 Running 0 25s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 25s - ray-worker-5c49b7cc57-zzfg2 1/1 Running 0 0s - -To validate that the restart behavior is working properly, try killing pods -and checking that they are restarted by Kubernetes: - -.. code-block:: shell - - # Delete a worker pod. - $ kubectl -n ray delete pod ray-worker-5c49b7cc57-c6xs8 - pod "ray-worker-5c49b7cc57-c6xs8" deleted - - # Check that a new worker pod was started (this may take a few seconds). - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-6bxvz 1/1 Running 0 45s - ray-worker-5c49b7cc57-d9m86 1/1 Running 0 40s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 40s - ray-worker-5c49b7cc57-ypq8x 1/1 Running 0 0s - - # Delete the head pod. - $ kubectl -n ray delete pod ray-head-5455bb66c9-6bxvz - pod "ray-head-5455bb66c9-6bxvz" deleted - - # Check that a new head pod was started and the worker pods were restarted. 
- $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-gqzql 1/1 Running 0 0s - ray-worker-5c49b7cc57-d9m86 1/1 Running 1 50s - ray-worker-5c49b7cc57-kzk4s 1/1 Running 1 50s - ray-worker-5c49b7cc57-ypq8x 1/1 Running 1 10s - - # You can even try deleting all of the pods in the Ray namespace and checking - # that Kubernetes brings the right number back up. - $ kubectl -n ray delete pods --all - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - ray-head-5455bb66c9-7l6xj 1/1 Running 0 10s - ray-worker-5c49b7cc57-57tpv 1/1 Running 0 10s - ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 10s - ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 10s - -Now that we have a running cluster, :ref:`we can execute Ray programs `. - -Cleaning Up ------------ - -To delete a running Ray cluster, you can run the following command: - -.. code-block:: shell - - kubectl delete -f ray/doc/kubernetes/ray-cluster.yaml - - -Questions or Issues? --------------------- - -.. include:: /_help.rst - - -.. _`Kubernetes Namespace`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ -.. _`Kubernetes Service`: https://kubernetes.io/docs/concepts/services-networking/service/ -.. _`Kubernetes Deployment`: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/ -.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/ - -.. _`Discussion Board`: https://discuss.ray.io/ diff --git a/doc/source/cluster/kubernetes.rst b/doc/source/cluster/kubernetes.rst index 1234ece998c0..36a9dc126c62 100644 --- a/doc/source/cluster/kubernetes.rst +++ b/doc/source/cluster/kubernetes.rst @@ -1,430 +1,254 @@ -*********************** -Deploying on Kubernetes -*********************** - .. _ray-k8s-deploy: -Introduction -============ -You can leverage your Kubernetes cluster as a substrate for execution of distributed Ray programs. 
-The Ray Autoscaler spins up and deletes Kubernetes pods according to resource demands of the Ray workload - each Ray node runs in its own Kubernetes pod. - -Quick Guide ------------ - -This document covers the following topics: - -- :ref:`Overview of methods for launching a Ray Cluster on Kubernetes` -- :ref:`Managing clusters with the Ray Cluster Launcher` -- :ref:`Managing clusters with the Ray Kubernetes Operator` -- :ref:`Interacting with a Ray Cluster via a Kubernetes Service` -- :ref:`Comparison of the Ray Cluster Launcher and Ray Kubernetes Operator` - -You can find more information at the following links: - -- :ref:`GPU usage with Kubernetes` -- :ref:`Using Ray Tune on your Kubernetes cluster` -- :ref:`How to manually set up a non-autoscaling Ray cluster on Kubernetes` - -.. _k8s-overview: - -Ray on Kubernetes -================= - -Ray supports two ways of launching an autoscaling Ray cluster on Kubernetes. - -- Using the :ref:`Ray Cluster Launcher ` -- Using the :ref:`Ray Kubernetes Operator ` - -The Cluster Launcher and Ray Kubernetes Operator provide similar functionality; each serves as an `interface to the Ray autoscaler`. -Below is a brief overview of the two tools. - -The Ray Cluster Launcher ------------------------- -The :ref:`Ray Cluster Launcher ` is geared towards experimentation and development and can be used to launch Ray clusters on Kubernetes (among other backends). -It allows you to manage an autoscaling Ray Cluster from your local environment using the :ref:`Ray CLI `. -For example, you can use ``ray up`` to launch a Ray cluster on Kubernetes and ``ray exec`` to execute commands in the Ray head node's pod. -Note that using the Cluster Launcher requires Ray to be :ref:`installed locally `. - -* Get started with the :ref:`Ray Cluster Launcher on Kubernetes`. - -The Ray Kubernetes Operator ---------------------------- -The Ray Kubernetes Operator is a Kubernetes-native solution geared towards production use cases. 
-Rather than handling cluster launching locally, cluster launching and autoscaling are centralized in the Operator's Pod. -The Operator follows the standard Kubernetes `pattern `__ - it runs -a control loop which manages a `Kubernetes Custom Resource`_ specifying the desired state of your Ray cluster. -Using the Kubernetes Operator does not require a local installation of Ray - all interactions with your Ray cluster are mediated by Kubernetes. - -* Get started with the :ref:`Ray Kubernetes Operator`. - - -Further reading ---------------- - -Read :ref:`here` for more details on the comparison between the Operator and Cluster Launcher. -Note that it is also possible to manually deploy a :ref:`non-autoscaling Ray cluster ` on Kubernetes. - -.. note:: - - The configuration ``yaml`` files used in this document are provided in the `Ray repository`_ - as examples to get you started. When deploying real applications, you will probably - want to build and use your own container images, add more worker nodes to the - cluster, and change the resource requests for the head and worker nodes. Refer to the provided ``yaml`` - files to be sure that you maintain important configuration options for Ray to - function properly. - - -.. _`Ray repository`: https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/kubernetes - -.. _k8s-cluster-launcher: - -Managing Clusters with the Ray Cluster Launcher -=============================================== - -This section briefly explains how to use the Ray Cluster Launcher to launch a Ray cluster on your existing Kubernetes cluster. - -First, install the Kubernetes API client (``pip install kubernetes``), then make sure your Kubernetes credentials are set up properly to access the cluster (if a command like ``kubectl get pods`` succeeds, you should be good to go). - -Once you have ``kubectl`` configured locally to access the remote cluster, you should be ready to launch your cluster. 
The provided `ray/python/ray/autoscaler/kubernetes/example-full.yaml `__ cluster config file will create a small cluster of one pod for the head node configured to autoscale up to two worker node pods, with all pods requiring 1 CPU and 0.5GiB of memory. - -Test that it works by running the following commands from your local machine: - -.. _cluster-launcher-commands: - -.. code-block:: bash - - # Create or update the cluster. When the command finishes, it will print - # out the command that can be used to get a remote shell into the head node. - $ ray up ray/python/ray/autoscaler/kubernetes/example-full.yaml - - # List the pods running in the cluster. You shoud only see one head node - # until you start running an application, at which point worker nodes - # should be started. Don't forget to include the Ray namespace in your - # 'kubectl' commands ('ray' by default). - $ kubectl -n ray get pods - - # Get a remote screen on the head node. - $ ray attach ray/python/ray/autoscaler/kubernetes/example-full.yaml - $ # Try running a Ray program with 'ray.init(address="auto")'. - - # View monitor logs - $ ray monitor ray/python/ray/autoscaler/kubernetes/example-full.yaml - - # Tear down the cluster - $ ray down ray/python/ray/autoscaler/kubernetes/example-full.yaml - -* Learn about :ref:`running Ray programs on Kubernetes ` - -.. _k8s-operator: - -Managing clusters with the Ray Kubernetes Operator -================================================== - -.. role:: bash(code) - :language: bash - -This section explains how to use the Ray Kubernetes Operator to launch a Ray cluster on your existing Kubernetes cluster. - -The example commands in this document launch six Kubernetes pods, using a total of 6 CPU and 3.5Gi memory. -If you are experimenting using a test Kubernetes environment such as `minikube`_, make sure to provision sufficient resources, e.g. -:bash:`minikube start --cpus=6 --memory=\"4G\"`. 
-Alternatively, reduce resource usage by editing the ``yaml`` files referenced in this document; for example, reduce ``minWorkers`` -in ``example_cluster.yaml`` and ``example_cluster2.yaml``. +Deploying on Kubernetes +======================= .. note:: - 1. The Ray Kubernetes Operator is still experimental. For the yaml files in the examples below, we recommend using the latest master version of Ray. - 2. The Ray Kubernetes Operator requires Kubernetes version at least ``v1.17.0``. Check Kubernetes version info with the command :bash:`kubectl version`. + This document is mainly for advanced Kubernetes usage. The easiest way to run a Ray cluster on Kubernetes is by using the built-in Cluster Launcher. Please see the :ref:`Cluster Launcher documentation ` for details. -Applying the RayCluster Custom Resource Definition --------------------------------------------------- -The Ray Kubernetes operator works by managing a user-submitted `Kubernetes Custom Resource`_ (CR) called a ``RayCluster``. -A RayCluster custom resource describes the desired state of the Ray cluster. -To get started, we need to apply the `Kubernetes Custom Resource Definition`_ (CRD) defining a RayCluster. +This document assumes that you have access to a Kubernetes cluster and have +``kubectl`` installed locally and configured to access the cluster. It will +first walk you through how to deploy a Ray cluster on your existing Kubernetes +cluster, then explore a few different ways to run programs on the Ray cluster. -.. code-block:: shell +To learn about deploying an autoscaling Ray cluster using :ref:`Ray's Kubernetes operator`, read +:ref:`here`. - $ kubectl apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml +For information on using GPUs with Ray on Kubernetes, see :ref:`here`. 
- customresourcedefinition.apiextensions.k8s.io/rayclusters.cluster.ray.io created +The configuration ``yaml`` files used here are provided in the `Ray repository`_ +as examples to get you started. When deploying real applications, you will probably +want to build and use your own container images, add more worker nodes to the +cluster (or use the `Kubernetes Horizontal Pod Autoscaler`_), and change the +resource requests for the head and worker nodes. Refer to the provided ``yaml`` +files to be sure that you maintain important configuration options for Ray to +function properly. -.. note:: +.. _`Ray repository`: https://github.com/ray-project/ray/tree/master/doc/kubernetes - The file ``cluster_crd.yaml`` defining the CRD is not meant to meant to be modified by the user. Rather, users :ref:`configure ` a RayCluster CR via a file like `ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml `__. - The Kubernetes API server then validates the user-submitted RayCluster resource against the CRD. +Creating a Ray Namespace +------------------------ -Picking a Kubernetes Namespace -------------------------------- -The rest of the Kubernetes resources we will use are `namespaced`_. -You can use an existing namespace for your Ray clusters or create a new one if you have permissions. -For this example, we will create a namespace called ``ray``. +First, create a `Kubernetes Namespace`_ for Ray resources on your cluster. The +following commands will create resources under this Namespace, so if you want +to use a different one than ``ray``, please be sure to also change the +``namespace`` fields in the provided ``yaml`` files and the value of every ``-n`` +flag passed to ``kubectl``. ..
code-block:: shell - $ kubectl create namespace ray + $ kubectl create -f ray/doc/kubernetes/ray-namespace.yaml - namespace/ray created - -Starting the Operator +Starting a Ray Cluster ---------------------- -To launch the operator in our namespace, we execute the following command. +.. toctree:: + :hidden: -.. code-block:: shell + /cluster/k8s-operator.rst - $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml +A Ray cluster consists of a single head node and a set of worker nodes (the +provided ``ray-cluster.yaml`` file will start 3 worker nodes). In the example +Kubernetes configuration, this is implemented as: - serviceaccount/ray-operator-serviceaccount created - role.rbac.authorization.k8s.io/ray-operator-role created - rolebinding.rbac.authorization.k8s.io/ray-operator-rolebinding created - pod/ray-operator-pod created +- A ``ray-head`` `Kubernetes Service`_ that enables the worker nodes to discover the location of the head node on start up. +- A ``ray-head`` `Kubernetes Deployment`_ that backs the ``ray-head`` Service with a single head node pod (replica). +- A ``ray-worker`` `Kubernetes Deployment`_ with multiple worker node pods (replicas) that connect to the ``ray-head`` pod using the ``ray-head`` Service. -The output shows that we've launched a Pod named ``ray-operator-pod``. This is the pod that runs the operator process. -The ServiceAccount, Role, and RoleBinding we have created grant the operator pod the `permissions`_ it needs to manage Ray clusters. +Note that because the head and worker nodes are Deployments, Kubernetes will +automatically restart pods that crash to maintain the correct number of +replicas. -.. _operator-launch: +- If a worker node goes down, a replacement pod will be started and joined to the cluster. +- If the head node goes down, it will be restarted. This will start a new Ray cluster. 
Worker nodes that were connected to the old head node will crash and be restarted, connecting to the new head node when they come back up. -Launching Ray Clusters ----------------------- -Finally, to launch a Ray cluster, we create a RayCluster custom resource. +Try deploying a cluster with the provided Kubernetes config by running the +following command: .. code-block:: shell - $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml + $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml - raycluster.cluster.ray.io/example-cluster created - -The operator detects the RayCluster resource we've created and launches an autoscaling Ray cluster. -Our RayCluster configuration specifies ``minWorkers:2`` in the second entry of ``spec.podTypes``, so we get a head node and two workers upon launch. - -.. note:: - - For more details about RayCluster resources, we recommend take a looking at the annotated example `example_cluster.yaml `__ applied in the last command. +Verify that the pods are running by running ``kubectl get pods -n ray``. You +may have to wait up to a few minutes for the pods to enter the 'Running' +state on the first run. .. code-block:: shell - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-hbxvv 1/1 Running 0 72s - example-cluster-ray-worker-4hvv6 1/1 Running 0 64s - example-cluster-ray-worker-78kp5 1/1 Running 0 64s - ray-operator-pod 1/1 Running 0 2m33s - -We see four pods: the operator, the Ray head node, and two Ray worker nodes. - -Let's launch another cluster in the same namespace, this one specifiying ``minWorkers:1``. + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-6bxvz 1/1 Running 0 10s + ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 5s + ray-worker-5c49b7cc57-d9m86 1/1 Running 0 5s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 5s -.. code-block:: shell +.. 
note:: - $ kubectl -n ray apply -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + You might see a nonzero number of RESTARTS for the worker pods. That can + happen when the worker pods start up before the head pod and the workers + aren't able to connect. This shouldn't affect the behavior of the cluster. -We confirm that both clusters are running in our namespace. +To change the number of worker nodes in the cluster, change the ``replicas`` +field in the worker deployment configuration in that file and then re-apply +the config as follows: .. code-block:: shell - $ kubectl -n ray get rayclusters - NAME STATUS AGE - example-cluster Running 19s - example-cluster2 Running 19s - - - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-th4wv 1/1 Running 0 10m - example-cluster-ray-worker-q9pjn 1/1 Running 0 10m - example-cluster-ray-worker-qltnp 1/1 Running 0 10m - example-cluster2-ray-head-kj5mg 1/1 Running 0 10s - example-cluster2-ray-worker-qsgnd 1/1 Running 0 1s - ray-operator-pod 1/1 Running 0 10m + # Edit 'ray/doc/kubernetes/ray-cluster.yaml' and change the 'replicas' + # field under the ray-worker deployment to, e.g., 4. -Now we can :ref:`run Ray programs` on our Ray clusters. - -.. _operator-logs: - -Monitoring ----------- -Autoscaling logs are written to the operator pod's ``stdout`` and can be accessed with :code:`kubectl logs`. -Each line of output is prefixed by the name of the cluster followed by a colon. -The following command gets the last hundred lines of autoscaling logs for our second cluster. - -.. code-block:: shell + # Re-apply the new configuration to the running deployment. 
+ $ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml + service/ray-head unchanged + deployment.apps/ray-head unchanged + deployment.apps/ray-worker configured - $ kubectl -n ray logs ray-operator-pod | grep ^example-cluster2: | tail -n 100 + # Verify that there are now the correct number of worker pods running. + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-6bxvz 1/1 Running 0 30s + ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 25s + ray-worker-5c49b7cc57-d9m86 1/1 Running 0 25s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 25s + ray-worker-5c49b7cc57-zzfg2 1/1 Running 0 0s -The output should include monitoring updates that look like this: +To validate that the restart behavior is working properly, try killing pods +and checking that they are restarted by Kubernetes: .. code-block:: shell - example-cluster2:2020-12-12 13:55:36,814 DEBUG autoscaler.py:693 -- Cluster status: 1 nodes - example-cluster2: - MostDelayedHeartbeats: {'172.17.0.4': 0.04093289375305176, '172.17.0.5': 0.04084634780883789} - example-cluster2: - NodeIdleSeconds: Min=36 Mean=38 Max=41 - example-cluster2: - ResourceUsage: 0.0/2.0 CPU, 0.0/1.0 Custom1, 0.0/1.0 is_spot, 0.0 GiB/0.58 GiB memory, 0.0 GiB/0.1 GiB object_store_memory - example-cluster2: - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 - example-cluster2:Worker node types: - example-cluster2: - worker-nodes: 1 - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:148 -- Cluster resources: [{'object_store_memory': 1.0, 'node:172.17.0.4': 1.0, 'memory': 5.0, 'CPU': 1.0}, {'object_store_memory': 1.0, 'is_spot': 1.0, 'memory': 6.0, 'node:172.17.0.5': 1.0, 'Custom1': 1.0, 'CPU': 1.0}] - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:149 -- Node counts: defaultdict(, {'head-node': 1, 'worker-nodes - ': 1}) - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:159 -- Placement group demands: [] - example-cluster2:2020-12-12 13:55:36,870 INFO 
resource_demand_scheduler.py:186 -- Resource demands: [] - example-cluster2:2020-12-12 13:55:36,870 INFO resource_demand_scheduler.py:187 -- Unfulfilled demands: [] - example-cluster2:2020-12-12 13:55:36,891 INFO resource_demand_scheduler.py:209 -- Node requests: {} - example-cluster2:2020-12-12 13:55:36,903 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). - example-cluster2:2020-12-12 13:55:36,923 DEBUG autoscaler.py:654 -- example-cluster2-ray-worker-tdxdr is not being updated and passes config check (can_update=True). + # Delete a worker pod. + $ kubectl -n ray delete pod ray-worker-5c49b7cc57-c6xs8 + pod "ray-worker-5c49b7cc57-c6xs8" deleted -Cleaning Up ------------ -We shut down a Ray cluster by deleting the associated RayCluster resource. -Either of the next two commands will delete our second cluster ``example-cluster2``. - -.. code-block:: shell - - $ kubectl -n ray delete raycluster example-cluster2 - # OR - $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml + # Check that a new worker pod was started (this may take a few seconds). + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-6bxvz 1/1 Running 0 45s + ray-worker-5c49b7cc57-d9m86 1/1 Running 0 40s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 40s + ray-worker-5c49b7cc57-ypq8x 1/1 Running 0 0s -The pods associated with ``example-cluster2`` go into the ``TERMINATING`` phase. In a few moments, we check that these pods are gone: + # Delete the head pod. + $ kubectl -n ray delete pod ray-head-5455bb66c9-6bxvz + pod "ray-head-5455bb66c9-6bxvz" deleted -.. code-block:: shell + # Check that a new head pod was started and the worker pods were restarted. 
+ $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-gqzql 1/1 Running 0 0s + ray-worker-5c49b7cc57-d9m86 1/1 Running 1 50s + ray-worker-5c49b7cc57-kzk4s 1/1 Running 1 50s + ray-worker-5c49b7cc57-ypq8x 1/1 Running 1 10s + + # You can even try deleting all of the pods in the Ray namespace and checking + # that Kubernetes brings the right number back up. + $ kubectl -n ray delete pods --all + $ kubectl -n ray get pods + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-7l6xj 1/1 Running 0 10s + ray-worker-5c49b7cc57-57tpv 1/1 Running 0 10s + ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 10s + ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 10s - $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-th4wv 1/1 Running 0 57m - example-cluster-ray-worker-q9pjn 1/1 Running 0 56m - example-cluster-ray-worker-qltnp 1/1 Running 0 56m - ray-operator-pod 1/1 Running 0 57m +.. _ray-k8s-run: -Only the operator pod and the first ``example-cluster`` remain. +Running Ray Programs +-------------------- -To finish clean-up, we delete the cluster ``example-cluster`` and then the operator's resources. +This section assumes that you have a running Ray cluster (if you don't, please +refer to the section above to get started) and will walk you through three +different options to run a Ray program on it: -.. code-block:: shell +1. Using `kubectl exec` to run a Python script. +2. Using `kubectl exec -it bash` to work interactively in a remote shell. +3. Submitting a `Kubernetes Job`_. - $ kubectl -n ray delete raycluster example-cluster - $ kubectl -n ray delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml +Running a program using 'kubectl exec' +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you like, you can delete the RayCluster customer resource definition. -(Using the operator again will then require reapplying the CRD.) 
+To run an example program that tests object transfers between nodes in the +cluster, try the following commands (don't forget to replace the head pod name +- you can find it by running ``kubectl -n ray get pods``): .. code-block:: shell - $ kubectl delete crd rayclusters.cluster.ray.io - # OR - $ kubectl delete -f ray/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml - - -.. _ray-k8s-interact: - -Interacting with a Ray Cluster -============================== -:ref:`Ray Client ` allows you to connect to your Ray cluster on Kubernetes and execute Ray programs. -The Ray Client server runs the Ray head node, by default on port 10001. - -:ref:`Ray Dashboard ` gives visibility into the state of your cluster. -By default, the dashboard uses port 8265 on the Ray head node. - -.. _k8s-service: + # Copy the test script onto the head node. + $ kubectl -n ray cp ray/doc/kubernetes/example.py ray-head-5455bb66c9-7l6xj:/example.py -Configuring a head node service -------------------------------- -To use Ray Client and Ray Dashboard, -you can connect via a `Kubernetes Service`_ targeting the relevant ports on the head node: + # Run the example program on the head node. + $ kubectl -n ray exec ray-head-5455bb66c9-7l6xj -- python example.py + # You should see repeated output for 10 iterations and then 'Success!' -.. _svc-example: +Running a program in a remote shell +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: yaml - - apiVersion: v1 - kind: Service - metadata: - name: example-cluster-ray-head - spec: - # This selector must match the head node pod's selector. - selector: - component: example-cluster-ray-head - ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - - -The head node pod's ``metadata`` should have a ``label`` matching the service's ``selector`` field: +You can also run tasks interactively on the cluster by connecting a remote +shell to one of the pods. 
-.. code-block:: yaml +.. code-block:: shell - apiVersion: v1 - kind: Pod - metadata: - # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - # Must match the head node service selector above if a head node - # service is required. - labels: - component: example-cluster-ray-head + # Copy the test script onto the head node. + $ kubectl -n ray cp ray/doc/kubernetes/example.py ray-head-5455bb66c9-7l6xj:/example.py -- The Ray Kubernetes Operator automatically configures a default service exposing ports 10001 and 8265 \ - on the head node pod. The Operator also adds the relevant label to the head node pod's configuration. \ - If this default service does not suit your use case, you can modify the service or create a new one, \ - for example by using the tools ``kubectl edit``, ``kubectl create``, or ``kubectl apply``. + # Get a remote shell to the head node. + $ kubectl -n ray exec -it ray-head-5455bb66c9-7l6xj -- bash -- The Ray Cluster launcher does not automatically configure a service targeting the head node. A \ - head node service can be specified in the cluster launching config's ``provider.services`` field. The example cluster lauching \ - config `example-full.yaml `__ includes \ - the :ref:`above ` service configuration as an example. + # Run the example program on the head node. + root@ray-head-6f566446c-5rdmb:/# python example.py + # You should see repeated output for 10 iterations and then 'Success!' -After launching a Ray cluster with either the Operator or Cluster Launcher, you can view the configured service: +You can also start an IPython interpreter to work interactively: .. code-block:: shell - $ kubectl -n ray get services + # From your local machine. + $ kubectl -n ray exec -it ray-head-5455bb66c9-7l6xj -- ipython - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - example-cluster-ray-head ClusterIP 10.106.123.159 10001/TCP,8265/TCP 52s + # From a remote shell on the head node. 
+ $ kubectl -n ray exec -it ray-head-5455bb66c9-7l6xj -- bash + root@ray-head-6f566446c-5rdmb:/# ipython -.. _ray-k8s-run: +Once you have the IPython interpreter running, try running the following example +program: -Running Ray Programs --------------------- -Given a running Ray cluster and a :ref:`Service ` exposing the Ray Client server's port on the head pod, -we can now run Ray programs on our cluster. +.. code-block:: python -In the following examples, we assume that we have a running Ray cluster with one head node and -two worker nodes. This can be achieved in one of two ways: + from collections import Counter + import platform + import time + import ray -- Using the :ref:`Operator ` with the example resource `ray/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml `__. -- Using :ref:`Cluster Launcher `. Modify the example file `ray/python/ray/autoscaler/kubernetes/example-full.yaml `__ - by setting the field ``available_node_types.worker_node.min_workers`` - to 2 and then run ``ray up`` with the modified config. + ray.init(address="$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY") + @ray.remote + def f(x): + time.sleep(0.01) + return x + (platform.node(), ) -Using Ray Client to connect from within the Kubernetes cluster -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can connect to your Ray cluster from another pod in the same Kubernetes cluster. + # Check that objects can be transferred from each node to each other node. + %time Counter(ray.get([f.remote(f.remote(())) for _ in range(100)])) -For example, you can submit a Ray application to run on the Kubernetes cluster as a `Kubernetes +Submitting a Job +~~~~~~~~~~~~~~~~ + +You can also submit a Ray application to run on the cluster as a `Kubernetes Job`_. The Job will run a single pod running the Ray driver program to completion, then terminate the pod but allow you to access the logs. 
-The following command submits a Job which executes an `example Ray program`_. +To submit a Job that downloads and executes an `example program`_ that tests +object transfers between nodes in the cluster, run the following command: -.. code-block:: yaml +.. code-block:: shell - $ kubectl create -f ray/python/ray/autoscaler/kubernetes/job-example.yaml + $ kubectl create -f ray/doc/kubernetes/ray-job.yaml + job.batch/ray-test-job-kw5gn created -The program executed by the Job waits for three Ray nodes to connect and then tests object transfer -between the nodes. Note that the program uses the environment variables -``EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_HOST`` and ``EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_PORT_CLIENT`` -to access Ray Client. These `environment variables`_ are set by Kubernetes based on -the service we are using to expose the Ray head node. +.. _`example program`: https://github.com/ray-project/ray/blob/master/doc/kubernetes/example.py To view the output of the Job, first find the name of the pod that ran it, then fetch its logs: @@ -432,15 +256,16 @@ then fetch its logs: .. code-block:: shell $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-rpqfb 1/1 Running 0 11m - example-cluster-ray-worker-4c7cn 1/1 Running 0 11m - example-cluster-ray-worker-zvglb 1/1 Running 0 11m - ray-test-job-8x2pm-77lb5 1/1 Running 0 8s + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-7l6xj 1/1 Running 0 15s + ray-test-job-kw5gn-5g7tv 0/1 Completed 0 10s + ray-worker-5c49b7cc57-57tpv 1/1 Running 0 15s + ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 15s + ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 15s # Fetch the logs. You should see repeated output for 10 iterations and then # 'Success!' 
- $ kubectl -n ray logs ray-test-job-8x2pm-77lb5 + $ kubectl -n ray logs ray-test-job-kw5gn-5g7tv To clean up the resources created by the Job after checking its output, run the following: @@ -457,139 +282,94 @@ the following: # Verify that the Job's pod was cleaned up. $ kubectl -n ray get pods - NAME READY STATUS RESTARTS AGE - example-cluster-ray-head-rpqfb 1/1 Running 0 11m - example-cluster-ray-worker-4c7cn 1/1 Running 0 11m - example-cluster-ray-worker-zvglb 1/1 Running 0 11m - -.. _`environment variables`: https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables -.. _`example Ray program`: https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/kubernetes/example_scripts/job_example.py - + NAME READY STATUS RESTARTS AGE + ray-head-5455bb66c9-7l6xj 1/1 Running 0 60s + ray-worker-5c49b7cc57-57tpv 1/1 Running 0 60s + ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 60s + ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 60s -Using Ray Client to connect from outside the Kubernetes cluster -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To connect to the Ray cluster from outside your Kubernetes cluster, -the head node Service needs to communicate with the outside world. +Cleaning Up +----------- -One way to achieve this is by port-forwarding. -Run the following command locally: +To delete a running Ray cluster, you can run the following command: .. code-block:: shell - $ kubectl -n ray port-forward service/example-cluster-ray-head 10001:10001 + kubectl delete -f ray/doc/kubernetes/ray-cluster.yaml -`Alternatively`, you can find the head node pod and connect to it directly with -the following command: +.. _k8s-gpus: -.. code-block:: shell +Using GPUs +---------- - # Substitute the name of your Ray cluster if using a name other than "example-cluster". 
- $ kubectl -n ray port-forward \ - $(kubectl -n ray get pods -l ray-cluster-name=example-cluster -l ray-node-type=head -o custom-columns=:metadata.name) 10001:10001 +To use GPUs on Kubernetes, you will need to both configure your Kubernetes setup and add additional values to your Ray cluster configuration. -Then open a new shell and try out a sample program: +For relevant documentation for GPU usage on different clouds, see instructions for `GKE`_, for `EKS`_, and for `AKS`_. -.. code-block:: shell +The `Ray Docker Hub <https://hub.docker.com/r/rayproject/>`_ hosts CUDA-based images packaged with Ray for use in Kubernetes pods. +For example, the image ``rayproject/ray-ml:nightly-gpu`` is ideal for running GPU-based ML workloads with the most recent nightly build of Ray. +Read :ref:`here` for further details on Ray images. - $ python ray/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py +Using Nvidia GPUs requires specifying the relevant resource ``limits`` in the container fields of your Kubernetes configurations. +(Kubernetes `sets <https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/>`_ +the GPU request equal to the limit.) The configuration for a pod running a Ray GPU image and +using one Nvidia GPU looks like this: -The program in this example uses ``ray.util.connect(127.0.0.1:10001)`` to connect to the Ray cluster. +.. code-block:: yaml + apiVersion: v1 + kind: Pod + metadata: + generateName: example-cluster-ray-worker + spec: + ... + containers: + - name: ray-node + image: rayproject/ray:nightly-gpu + ... + resources: + cpu: 1000m + memory: 512Mi + limits: + memory: 512Mi + nvidia.com/gpu: 1 + +GPU taints and tolerations +~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: - Connecting with Ray client requires using the matching minor versions of Python (for example 3.7) - on the server and client end -- that is on the Ray head node and in the environment where - ``ray.util.connect`` is invoked. Note that the default ``rayproject/ray`` images use Python 3.7. - Nightly builds are now available for Python 3.6 and 3.8 at the `Ray Docker Hub `_.
+ Users using a managed Kubernetes service probably don't need to worry about this section. -Running the program on the head node -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -It is also possible to execute a Ray program on the Ray head node. -(Replace the pod name with the name of your head pod -- you can find it by running ``kubectl -n ray get pods``.) +The `Nvidia gpu plugin`_ for Kubernetes applies `taints`_ to GPU nodes; these taints prevent non-GPU pods from being scheduled on GPU nodes. +Managed Kubernetes services like GKE, EKS, and AKS automatically apply matching `tolerations`_ +to pods requesting GPU resources. Tolerations are applied by means of Kubernetes's `ExtendedResourceToleration`_ `admission controller`_. +If this admission controller is not enabled for your Kubernetes cluster, you may need to manually add a GPU toleration to each of your GPU pod configurations. For example, -.. code-block:: shell - - $ kubectl -n ray exec example-cluster-ray-head-5455bb66c9-7l6xj -- python /home/ray/anaconda3/lib/python3.7/site-packages/ray/autoscaler/kubernetes/example_scripts/run_on_head.py - - -Alternatively, you can run tasks interactively on the cluster by connecting a remote -shell to one of the pods. - -.. code-block:: shell - - # Get a remote shell to the head node. - $ kubectl -n ray exec -it example-cluster-ray-head-5455bb66c9-7l6xj -- bash - - # Run the example program on the head node. - root@ray-head-6f566446c-5rdmb:/# python /home/ray/anaconda3/lib/python3.7/site-packages/ray/autoscaler/kubernetes/example_scripts/run_on_head.py - # You should see repeated output for 10 iterations and then 'Success!' - - -The program in this example uses ``ray.init(address="auto")`` to connect to the Ray cluster. - -Accessing the Dashboard ------------------------ - -The Ray Dashboard can accessed locally using ``kubectl port-forward``. - -.. 
code-block:: shell - - $ kubectl -n ray port-forward service/example-cluster-ray-head 8265:8265 - -After running the above command locally, the Dashboard will be accessible at ``http://localhost:8265``. - -You can also monitor the state of the cluster with ``kubectl logs`` when using the :ref:`Operator ` or with ``ray monitor`` when using -the :ref:`Ray Cluster Launcher `. - -.. warning:: - The Dashboard currently shows resource limits of the physical host each Ray node is running on, - rather than the limits of the container the node is running in. - This is a known bug tracked `here `_. - - -.. _k8s-comparison: - -Cluster Launcher vs Operator -============================ - -We compare the Ray Cluster Launcher and Ray Kubernetes Operator as methods of managing an autoscaling Ray cluster. - - -Comparison of use cases ------------------------ - -- The Cluster Launcher is convenient for development and experimentation. Using the Cluster Launcher requires a local installation of Ray. The Ray CLI then provides a convenient interface for interacting with a Ray cluster. - -- The Operator is geared towards production use cases. It does not require installing Ray locally - all interactions with your Ray cluster are mediated by Kubernetes. - - -Comparison of architectures ---------------------------- - -- With the Cluster Launcher, the user launches a Ray cluster from their local environment by invoking ``ray up``. This provisions a pod for the Ray head node, which then runs the `autoscaling process `__. - -- The `Operator `__ centralizes cluster launching and autoscaling in the `Operator pod `__. \ - The user creates a `Kubernetes Custom Resource`_ describing the intended state of the Ray cluster. \ - The Operator then detects the resource, launches a Ray cluster, and runs the autoscaling process in the operator pod. \ - The Operator can manage multiple Ray clusters by running an autoscaling process for each Ray cluster. 
- -Comparison of configuration options ------------------------------------ - -The configuration options for the two methods are completely analogous - compare sample configurations for the `Cluster Launcher `__ -and for the `Operator `__. -With a few exceptions, the fields of the RayCluster resource managed by the Operator are camelCase versions of the corresponding snake_case Cluster Launcher fields. -In fact, the Operator `internally `__ converts -RayCluster resources to Cluster Launching configs. - -A summary of the configuration differences: +.. code-block:: yaml -- The Cluster Launching field ``available_node_types`` for specifiying the types of pods available for autoscaling is renamed to ``podTypes`` in the Operator's RayCluster configuration. -- The Cluster Launching field ``resources`` for specifying custom Ray resources provided by a node type is renamed to ``rayResources`` in the Operator's RayCluster configuration. -- The ``provider`` field in the Cluster Launching config has no analogue in the Operator's RayCluster configuration. (The Operator fills this field internally.) -- * When using the Cluster Launcher, ``head_ray_start_commands`` should include the argument ``--autoscaling-config=~/ray_bootstrap_config.yaml``; this is important for the configuration of the head node's autoscaler. - * On the other hand, the Operator's ``headRayStartCommands`` should include a ``--no-monitor`` flag to prevent the autoscaling/monitoring process from running on the head node. + apiVersion: v1 + kind: Pod + metadata: + generateName: example-cluster-ray-worker + spec: + ... + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + ... + containers: + - name: ray-node + image: rayproject/ray:nightly-gpu + ... + +Further reference and discussion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Read about Kubernetes device plugins `here `__, +about Kubernetes GPU plugins `here `__, +and about Nvidia's GPU plugin for Kubernetes `here `__. 
+ +If you run into problems setting up GPUs for your Ray cluster on Kubernetes, please reach out to us at our `Discussion Board`_. Questions or Issues? -------------------- @@ -597,13 +377,19 @@ Questions or Issues? .. include:: /_help.rst - -.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/ +.. _`Kubernetes Horizontal Pod Autoscaler`: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ +.. _`Kubernetes Namespace`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ .. _`Kubernetes Service`: https://kubernetes.io/docs/concepts/services-networking/service/ -.. _`Kubernetes Operator`: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/ -.. _`Kubernetes Custom Resource`: https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/ -.. _`Kubernetes Custom Resource Definition`: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/ -.. _`annotation`: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/#attaching-metadata-to-objects -.. _`permissions`: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ -.. _`minikube`: https://minikube.sigs.k8s.io/docs/start/ -.. _`namespaced`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ +.. _`Kubernetes Deployment`: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/ +.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/ + +.. _`Discussion Board`: https://discuss.ray.io/ +.. _`GKE`: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus +.. _`EKS`: https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html +.. _`AKS`: https://docs.microsoft.com/en-us/azure/aks/gpu-cluster + +.. _`tolerations`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ +.. 
_`taints`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ +.. _`Nvidia gpu plugin`: https://github.com/NVIDIA/k8s-device-plugin +.. _`admission controller`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/ +.. _`ExtendedResourceToleration`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/#extendedresourcetoleration diff --git a/doc/source/cluster/launcher.rst b/doc/source/cluster/launcher.rst new file mode 100644 index 000000000000..8c63f04f9a4f --- /dev/null +++ b/doc/source/cluster/launcher.rst @@ -0,0 +1,66 @@ +.. _ref-automatic-cluster: + +Launching Cloud Clusters with Ray +================================= + +Ray comes with a built-in cluster launcher that makes deploying a Ray cluster simple. + +The cluster launcher will provision resources from a node provider (like :ref:`AWS EC2 ` or :ref:`Kubernetes `) to instantiate the specified cluster, and start a Ray cluster on the provisioned resources. + +You can configure the Ray Cluster Launcher to use with :ref:`a cloud provider `, an existing :ref:`Kubernetes cluster `, or a private cluster of machines. + +.. tabs:: + .. group-tab:: AWS + + .. code-block:: shell + + # First, run `pip install boto3` and `aws configure` + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/aws/example-full.yaml + + See :ref:`the AWS section ` for full instructions. + + .. group-tab:: GCP + + .. code-block:: shell + + # First, ``pip install google-api-python-client`` + # set up your GCP credentials, and + # create a new GCP project. + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/gcp/example-full.yaml + + See :ref:`the GCP section ` for full instructions. + + .. 
group-tab:: Azure + + .. code-block:: shell + + # First, install the Azure CLI + # (``pip install azure-cli azure-core``) then + # login using ``az login``. + # + # Create or update the cluster. When the command finishes, it will print + # out the command that can be used to SSH into the cluster head node. + $ ray up ray/python/ray/autoscaler/azure/example-full.yaml + + See :ref:`the Azure section ` for full instructions. + + +Once the Ray cluster is running, you can manually SSH into it or use provided commands like ``ray attach``, ``ray rsync-up``, and ``ray exec`` to access it and run Ray programs. + + +.. toctree:: + + /cluster/cloud.rst + /cluster/config.rst + /cluster/commands.rst + +Questions or Issues? +-------------------- + +.. include:: /_help.rst diff --git a/doc/source/cluster/quickstart.rst b/doc/source/cluster/quickstart.rst deleted file mode 100644 index f02db280e4b4..000000000000 --- a/doc/source/cluster/quickstart.rst +++ /dev/null @@ -1,240 +0,0 @@ -.. _ref-cluster-quick-start: - -Quick Start Cluster Autoscaling Demo -==================================== - -This quick start demonstrates the capabilities of the Ray cluster. Using the Ray cluster, we'll take a sample application designed to run on a laptop and scale it up in the cloud. Ray will launch clusters and scale Python with just a few commands. - -About the demo --------------- - -This demo will walk through an end-to-end flow: - -1. Create a (basic) Python application. -2. Launch a cluster on a cloud provider. -3. Run the application in the cloud. - -Requirements -~~~~~~~~~~~~ - -To run this demo, you will need: - -* Python installed on your development machine (typically your laptop), and -* an account at your preferred cloud provider (AWS, Azure or GCP). - -Setup -~~~~~ - -Before we start, you will need to install some Python dependencies as follows: - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: shell - - $ pip install -U ray boto3 - - .. group-tab:: Azure - - .. 
code-block:: shell - - $ pip install -U ray azure-cli azure-core - - .. group-tab:: GCP - - .. code-block:: shell - - $ pip install -U ray google-api-python-client - -Next, if you're not set up to use your cloud provider from the command line, you'll have to configure your credentials: - -.. tabs:: - .. group-tab:: AWS - - Configure your credentials in ``~/.aws/credentials`` as described in `the AWS docs `_. - - .. group-tab:: Azure - - Log in using ``az login``, then configure your credentials with ``az account set -s ``. - - .. group-tab:: GCP - - Set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable as described in `the GCP docs `_. - -Create a (basic) Python application ------------------------------------ - -We will write a simple Python application that tracks the IP addresses of the machines that its tasks are executed on: - -.. code-block:: python - - from collections import Counter - import socket - import time - - def f(): - time.sleep(0.001) - # Return IP address. - return socket.gethostbyname(socket.gethostname()) - - ip_addresses = [f() for _ in range(10000)] - print(Counter(ip_addresses)) - -Save this application as ``script.py`` and execute it by running the command ``python script.py``. The application should take 10 seconds to run and output something similar to ``Counter({'127.0.0.1': 10000})``. - -With some small changes, we can make this application run on Ray (for more information on how to do this, refer to :ref:`the Ray Core Walkthrough`): - -.. code-block:: python - - from collections import Counter - import socket - import time - - import ray - - ray.init() - - @ray.remote - def f(): - time.sleep(0.001) - # Return IP address. - return socket.gethostbyname(socket.gethostname()) - - object_ids = [f.remote() for _ in range(10000)] - ip_addresses = ray.get(object_ids) - print(Counter(ip_addresses)) - -Finally, let's add some code to make the output more interesting: - -.. 
code-block:: python - - from collections import Counter - import socket - import time - - import ray - - ray.init() - - print('''This cluster consists of - {} nodes in total - {} CPU resources in total - '''.format(len(ray.nodes()), ray.cluster_resources()['CPU'])) - - @ray.remote - def f(): - time.sleep(0.001) - # Return IP address. - return socket.gethostbyname(socket.gethostname()) - - object_ids = [f.remote() for _ in range(10000)] - ip_addresses = ray.get(object_ids) - - print('Tasks executed') - for ip_address, num_tasks in Counter(ip_addresses).items(): - print(' {} tasks on {}'.format(num_tasks, ip_address)) - -Running ``python script.py`` should now output something like: - -.. parsed-literal:: - - This cluster consists of - 1 nodes in total - 4.0 CPU resources in total - - Tasks executed - 10000 tasks on 127.0.0.1 - -Launch a cluster on a cloud provider ------------------------------------- - -To start a Ray Cluster, first we need to define the cluster configuration. The cluster configuration is defined within a YAML file that will be used by the Cluster Launcher to launch the head node, and by the Autoscaler to launch worker nodes. - -A minimal sample cluster configuration file looks as follows: - -.. tabs:: - .. group-tab:: AWS - - .. code-block:: yaml - - # An unique identifier for the head node and workers of this cluster. - cluster_name: minimal - - # Cloud-provider specific configuration. - provider: - type: aws - region: us-west-2 - - .. group-tab:: Azure - - .. code-block:: yaml - - # An unique identifier for the head node and workers of this cluster. - cluster_name: minimal - - # Cloud-provider specific configuration. - provider: - type: azure - location: westus2 - resource_group: ray-cluster - - # How Ray will authenticate with newly launched nodes. 
- auth: - ssh_user: ubuntu - # you must specify paths to matching private and public key pair files - # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair - ssh_private_key: ~/.ssh/id_rsa - # changes to this should match what is specified in file_mounts - ssh_public_key: ~/.ssh/id_rsa.pub - - .. group-tab:: GCP - - .. code-block:: yaml - - # A unique identifier for the head node and workers of this cluster. - cluster_name: minimal - - # Cloud-provider specific configuration. - provider: - type: gcp - region: us-west1 - -Save this configuration file as ``config.yaml``. You can specify a lot more details in the configuration file: instance types to use, minimum and maximum number of workers to start, autoscaling strategy, files to sync, and more. For a full reference on the available configuration properties, please refer to the :ref:`cluster YAML configuration options reference `. - -After defining our configuration, we will use the Ray Cluster Launcher to start a cluster on the cloud, creating a designated "head node" and worker nodes. To start the Ray cluster, we will use the :ref:`Ray CLI `. Run the following command: - -.. code-block:: shell - - $ ray up -y config.yaml - -Run the application in the cloud --------------------------------- - -We are now ready to execute the application in across multiple machines on our Ray cloud cluster. Run the following command: - -.. code-block:: shell - - $ ray submit config.yaml script.py - -The output should now look similar to the following: - -.. parsed-literal:: - - This cluster consists of - 3 nodes in total - 6.0 CPU resources in total - - Tasks executed - 3425 tasks on xxx.xxx.xxx.xxx - 3834 tasks on xxx.xxx.xxx.xxx - 2741 tasks on xxx.xxx.xxx.xxx - -In this sample output, 3 nodes were started. If the output only shows 1 node, you may want to increase the ``secs`` in ``time.sleep(secs)`` to give Ray more time to start additional nodes. - -The Ray CLI offers additional functionality. 
For example, you can monitor the Ray cluster status with ``ray monitor config.yaml``, and you can connect to the cluster (ssh into the head node) with ``ray attach config.yaml``. For a full reference on the Ray CLI, please refer to :ref:`the cluster commands reference `. - -To finish, don't forget to shut down the cluster. Run the following command: - -.. code-block:: shell - - $ ray down -y config.yaml diff --git a/doc/source/cluster/reference.rst b/doc/source/cluster/reference.rst deleted file mode 100644 index ad9388060ae6..000000000000 --- a/doc/source/cluster/reference.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _cluster-reference: - -Config YAML and CLI Reference -============================= - -.. toctree:: - :maxdepth: 2 - - config.rst - commands.rst - sdk.rst diff --git a/doc/source/cluster/sdk.rst b/doc/source/cluster/sdk.rst deleted file mode 100644 index 7238ee55823f..000000000000 --- a/doc/source/cluster/sdk.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _ref-autoscaler-sdk: - -Autoscaler SDK -============== - -.. _ref-autoscaler-sdk-request-resources: - -ray.autoscaler.sdk.request_resources ------------------------------------- - -Within a Ray program, you can command the autoscaler to scale the cluster up to a desired size with ``request_resources()`` call. The cluster will immediately attempt to scale to accommodate the requested resources, bypassing normal upscaling speed constraints. - -.. 
autofunction:: ray.autoscaler.sdk.request_resources \ No newline at end of file diff --git a/doc/source/conf.py b/doc/source/conf.py index b1a74f2634ee..bdff928f76ba 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -148,7 +148,6 @@ class SimpleClass2(object): 'sphinx_gallery.gen_gallery', 'sphinxemoji.sphinxemoji', 'sphinx_copybutton', - 'sphinxcontrib.yt', 'versionwarning.extension', ] diff --git a/doc/source/dask-on-ray.rst b/doc/source/dask-on-ray.rst index 486dc9a1fcd8..b5383ac8beda 100644 --- a/doc/source/dask-on-ray.rst +++ b/doc/source/dask-on-ray.rst @@ -1,32 +1,22 @@ +*********** Dask on Ray -=========== +*********** -.. _dask-on-ray: +Ray offers a scheduler integration for Dask, allowing you to build data +analyses using the familiar Dask collections (dataframes, arrays) and execute +the underlying computations on a Ray cluster. Using this Dask scheduler, the +entire Dask ecosystem can be executed on top of Ray. -`Dask `__ is a Python parallel computing library geared towards scaling analytics and -scientific computing workloads. It provides `big data collections -`__ that mimic the APIs of -the familiar `NumPy `__ and `Pandas `__ libraries, -allowing those abstractions to represent -larger-than-memory data and/or allowing operations on that data to be run on a multi-machine cluster, -while also providing automatic data parallelism, smart scheduling, -and optimized operations. Operations on these collections create a task graph, which is -executed by a scheduler. - -Ray provides a scheduler for Dask (`dask_on_ray`) which allows you to build data -analyses using Dask's collections and execute -the underlying tasks on a Ray cluster. +.. note:: -`dask_on_ray` uses Dask's scheduler API, which allows you to -specify any callable as the scheduler that you would like Dask to use to execute your -workload. Using the Dask-on-Ray scheduler, the entire Dask ecosystem can be executed on top of Ray. 
+ Note that Ray does not currently support object spilling, and hence cannot + process datasets larger than cluster memory. This is a planned feature. +========= Scheduler ---------- +========= -.. _dask-on-ray-scheduler: - -The Dask-on-Ray scheduler can execute any valid Dask graph, and can be used with +The Dask-Ray scheduler can execute any valid Dask graph, and can be used with any Dask `.compute() `__ call. Here's an example: @@ -35,99 +25,53 @@ Here's an example: import ray from ray.util.dask import ray_dask_get - import dask.array as da - import dask.dataframe as dd - import numpy as np - import pandas as pd + import dask.delayed import time # Start Ray. # Tip: If you're connecting to an existing cluster, use ray.init(address="auto"). ray.init() - d_arr = da.from_array(np.random.randint(0, 1000, size=(256, 256))) - # The Dask scheduler submits the underlying task graph to Ray. - d_arr.mean().compute(scheduler=ray_dask_get) + @dask.delayed + def inc(x): + time.sleep(1) + return x + 1 - # Set the scheduler to ray_dask_get in your config so you don't have to specify it on - # each compute call. - dask.config.set(scheduler=ray_dask_get) + @dask.delayed + def add(x, y): + time.sleep(3) + return x + y - df = dd.from_pandas(pd.DataFrame( - np.random.randint(0, 100, size=(1024, 2)), - columns=["age", "grade"])) - df.groupby(["age"]).mean().compute() - - -.. note:: - For execution on a Ray cluster, you should *not* use the - `Dask.distributed `__ - client; simply use plain Dask and its collections, and pass ``ray_dask_get`` - to ``.compute()`` calls or set the scheduler in one of the other ways detailed `here `__. Follow the instructions for - :ref:`using Ray on a cluster ` to modify the - ``ray.init()`` call. + x = inc(1) + y = inc(2) + z = add(x, y) + # The Dask scheduler submits the underlying task graph to Ray. + z.compute(scheduler=ray_dask_get) Why use Dask on Ray? -1. 
To take advantage of Ray-specific features such as the - :ref:`launching cloud clusters ` and + 1. If you'd like to create data analyses using the familiar NumPy and Pandas + APIs provided by Dask and execute them on a production-ready distributed + task execution system like Ray. + 2. If you'd like to use Dask and Ray libraries in the same application + without having two different task execution backends. + 3. To take advantage of Ray-specific features such as the + :ref:`cluster launcher ` and :ref:`shared-memory store `. -2. If you'd like to use Dask and Ray libraries in the same application without having two different clusters. -3. If you'd like to create data analyses using the familiar NumPy and Pandas APIs provided by Dask and execute them on a fast, fault-tolerant distributed task execution system geared towards production, like Ray. - -Dask-on-Ray is an ongoing project and is not expected to achieve the same performance as using Ray directly. All `Dask abstractions `__ should run seamlessly on top of Ray using this scheduler, so if you find that one of these abstractions doesn't run on Ray, please `open an issue `__. - -Out-of-Core Data Processing ---------------------------- - -.. _dask-on-ray-out-of-core: - -Processing datasets larger than cluster memory is supported via Ray's :ref:`object spilling `: if -the in-memory object store is full, objects will be spilled to external storage (local disk by -default). This feature is available but off by default in Ray 1.2, and is on by default -in Ray 1.3+. Please see your Ray version's object spilling documentation for steps to enable and/or configure -object spilling. - -Custom optimization for Dask DataFrame shuffling ------------------------------------------------- -.. _dask-on-ray-shuffle-optimization: +Note that for execution on a Ray cluster, you should *not* use the +`Dask.distributed `__ +client; simply use plain Dask and its collections, and pass ``ray_dask_get`` +to ``.compute()`` calls. 
Follow the instructions for +:ref:`using Ray on a cluster ` to modify the +``ray.init()`` call. -Dask on Ray provides a Dask DataFrame optimizer that leverages Ray's ability to -execute multiple-return tasks in order to speed up shuffling by as much as 4x on Ray. -Simply set the `dataframe_optimize` configuration option to our optimizer function, similar to how you specify the Dask-on-Ray scheduler: - -.. code-block:: python - - import ray - from ray.util.dask import ray_dask_get, dataframe_optimize - import dask.dataframe as dd - import numpy as np - import pandas as pd - import time - - # Start Ray. - # Tip: If you're connecting to an existing cluster, use ray.init(address="auto"). - ray.init() - - # Set the scheduler to ray_dask_get, and set the Dask DataFrame optimizer to our - # custom optimization function, this time using the config setter as a context manager. - with dask.config.set(scheduler=ray_dask_get, dataframe_optimize=dataframe_optimize): - npartitions = 100 - df = dd.from_pandas(pd.DataFrame( - np.random.randint(0, 100, size=(10000, 2)), - columns=["age", "grade"]), npartitions=npartitions) - # We set max_branch to infinity in order to ensure that the task-based shuffle - # happens in a single stage, which is required in order for our optimization to - # work. - df.set_index( - ["age"], shuffle="tasks", max_branch=float("inf")).head(10, npartitions=-1) +Dask-on-Ray is an ongoing project and is not expected to achieve the same performance as using Ray directly. +========= Callbacks ---------- - -.. _dask-on-ray-callbacks: +========= Dask's `custom callback abstraction `__ is extended with Ray-specific callbacks, allowing the user to hook into the @@ -264,12 +208,11 @@ execution time exceeds some user-defined threshold: with cache_callback: z.compute(scheduler=ray_dask_get) -.. 
note:: - The existing Dask scheduler callbacks (``start``, ``start_state``, - ``pretask``, ``posttask``, ``finish``) are also available, which can be used to - introspect the Dask task to Ray task conversion process, but note that the ``pretask`` - and ``posttask`` hooks are executed before and after the Ray task is *submitted*, not - executed, and that ``finish`` is executed after all Ray tasks have been - *submitted*, not executed. +Note that the existing Dask scheduler callbacks (``start``, ``start_state``, +``pretask``, ``posttask``, ``finish``) are also available, which can be used to +introspect the Dask task to Ray task conversion process, but that ``pretask`` +and ``posttask`` are executed before and after the Ray task is *submitted*, not +executed, and that ``finish`` is executed after all Ray tasks have been +*submitted*, not executed. This callback API is currently unstable and subject to change. diff --git a/doc/source/getting-involved.rst b/doc/source/getting-involved.rst index f1ef61b0938e..2ee0318a24a4 100644 --- a/doc/source/getting-involved.rst +++ b/doc/source/getting-involved.rst @@ -6,7 +6,8 @@ Getting Involved / Contributing Ray is more than a framework for distributed applications but also an active community of developers, researchers, and folks that love machine learning. -.. tip:: Ask questions on `our forum `_! The +.. tip:: Join our `community Slack `_ to + discuss Ray or ask questions on `our forum `_! The community is extremely active in helping people succeed in building their Ray applications. diff --git a/doc/source/index.rst b/doc/source/index.rst index 182ff7ef7ce4..9edb823b20ad 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -117,17 +117,14 @@ Ray provides Python, Java, and *EXPERIMENTAL* C++ API. And Ray uses Tasks (funct | The C++ Ray API is currently experimental with limited support. You can track its development `here `__ and report issues on GitHub. 
| Run the following commands to get started: | - Build ray from source with *bazel* as shown `here `__. - | - Modify `cpp/example/example.cc`. - | - Run `"bazel build //cpp:example"`. - | Option 1: run the example directly with a dynamic library path. It will start a Ray cluster automatically. + | - Run `"cd ray/cpp"`. + | - Run `"cp dev_BUILD.bazel BUILD.bazel"`. + | - Modify `src/ray/example/example.cc`. | - Run `"ray stop"`. - | - Run `"./bazel-bin/cpp/example/example --dynamic-library-path=bazel-bin/cpp/example/example.so"` - | Option 2: connect to an existing Ray cluster with a known redis address (e.g. `127.0.0.1:6379`). - | - Run `"ray stop"`. - | - Run `"ray start --head --port 6379 --redis-password 5241590000000000 --node-manager-port 62665"`. - | - Run `"./bazel-bin/cpp/example/example --dynamic-library-path=bazel-bin/cpp/example/example.so --redis-address=127.0.0.1:6379"`. + | - Run `"bazel build //cpp:all"`. + | - Run `"bazel run //cpp:example"`. - .. literalinclude:: ../../cpp/example/example.cc + .. literalinclude:: ../../cpp/src/ray/example/example.cc :language: cpp You can also get started by visiting our `Tutorials `_. For the latest wheels (nightlies), see the `installation page `__. @@ -231,12 +228,11 @@ Papers .. toctree:: :hidden: :maxdepth: -1 - :caption: Ray Clusters/Autoscaler + :caption: Ray Cluster cluster/index.rst - cluster/quickstart.rst - cluster/reference.rst - cluster/cloud.rst + cluster/launcher.rst + cluster/autoscaling.rst cluster/deploy.rst .. toctree:: @@ -297,16 +293,6 @@ Papers raysgd/raysgd_tune.rst raysgd/raysgd_ref.rst -.. toctree:: - :hidden: - :maxdepth: -1 - :caption: Data Processing - - modin/index.rst - dask-on-ray.rst - mars-on-ray.rst - raydp.rst - .. toctree:: :hidden: :maxdepth: -1 @@ -316,6 +302,8 @@ Papers joblib.rst iter.rst xgboost-ray.rst + dask-on-ray.rst + mars-on-ray.rst ray-client.rst .. 
toctree:: diff --git a/doc/source/installation.rst b/doc/source/installation.rst index a35dffea39cc..397113d95c04 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -3,6 +3,8 @@ Installing Ray ============== +.. tip:: Join our `community slack `_ to discuss Ray! + Ray currently supports MacOS and Linux. Windows wheels are now available, but :ref:`Windows support ` is experimental and under development. @@ -22,7 +24,22 @@ You can install the latest official version of Ray as follows. Official releases Daily Releases (Nightlies) -------------------------- -You can install the nightly Ray wheels via the following links. These daily releases are tested via automated tests but do not go through the full release process. To install these wheels, use the following ``pip`` command and wheels: +You can install the latest Ray wheels via the following command. These daily releases are tested via automated tests but do not go through the full release process: + +.. code-block:: bash + + pip install -U ray + ray install-nightly + + +.. note:: ``ray install-nightly`` may not capture updated library dependencies. After running ``ray install-nightly``, consider running ``pip install ray[<library>]`` *without upgrading (via -U)* to update dependencies. + + +.. note:: If you're currently on ``ray<=1.0.1.post1``, ``ray install-nightly`` will not install the most recent nightly wheels. Please use the links below instead. + +Alternatively, here are the links to the latest wheels (which are built for each commit on the +master branch). To install these wheels, use the following ``pip`` command and wheels +instead of the ones above: .. 
code-block:: bash diff --git a/doc/source/memory-management.rst b/doc/source/memory-management.rst index f12f7efefd33..ca4551750c50 100644 --- a/doc/source/memory-management.rst +++ b/doc/source/memory-management.rst @@ -18,7 +18,7 @@ Ray system memory: this is memory used internally by Ray Application memory: this is memory used by your application - **Worker heap**: memory used by your application (e.g., in Python code or TensorFlow), best measured as the *resident set size (RSS)* of your application minus its *shared memory usage (SHR)* in commands such as ``top``. The reason you need to subtract *SHR* is that object store shared memory is reported by the OS as shared with each worker. Not subtracting *SHR* will result in double counting memory usage. - - **Object store memory**: memory used when your application creates objects in the object store via ``ray.put`` and when returning values from remote functions. Objects are reference counted and evicted when they fall out of scope. There is an object store server running on each node. In Ray 1.3+, objects will be `spilled to disk <#object-spilling>`__ if the object store fills up. + - **Object store memory**: memory used when your application creates objects in the object store via ``ray.put`` and when returning values from remote functions. Objects are reference counted and evicted when they fall out of scope. There is an object store server running on each node. - **Object store shared memory**: memory used when your application reads objects via ``ray.get``. Note that if an object is already present on the node, this does not cause additional allocations. This allows large objects to be efficiently shared among many actors and tasks. ObjectRef Reference Counting @@ -26,6 +26,27 @@ ObjectRef Reference Counting Ray implements distributed reference counting so that any ``ObjectRef`` in scope in the cluster is pinned in the object store. 
This includes local python references, arguments to pending tasks, and IDs serialized inside of other objects. +Frequently Asked Questions (FAQ) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**My application failed with ObjectStoreFullError. What happened?** + +Ensure that you're removing ``ObjectRef`` references when they're no longer needed. See `Debugging using 'ray memory'`_ for information on how to identify what objects are in scope in your application. + +This exception is raised when the object store on a node was full of pinned objects when the application tried to create a new object (either by calling ``ray.put()`` or returning an object from a task). If you're sure that the configured object store size was large enough for your application to run, ensure that you're removing ``ObjectRef`` references when they're no longer in use so their objects can be evicted from the object store. + +**I'm running Ray inside IPython or a Jupyter Notebook and there are ObjectRef references causing problems even though I'm not storing them anywhere.** + +Try `Enabling LRU Fallback`_, which will cause unused objects referenced by IPython to be LRU evicted when the object store is full instead of erroring. + +IPython stores the output of every cell in a local Python variable indefinitely. This causes Ray to pin the objects even though your application may not actually be using them. + +**My application used to run on previous versions of Ray but now I'm getting ObjectStoreFullError.** + +Either modify your application to remove ``ObjectRef`` references when they're no longer needed or try `Enabling LRU Fallback`_ to revert to the old behavior. + +In previous versions of Ray, there was no reference counting and instead objects in the object store were LRU evicted once the object store ran out of space. Some applications (e.g., applications that keep references to all objects ever created) may have worked with LRU eviction but do not with reference counting. 
+ Debugging using 'ray memory' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -177,17 +198,38 @@ In this example, we first create an object via ``ray.put()``, then capture its ` In the output of ``ray memory``, we see that the second object displays as a normal ``LOCAL_REFERENCE``, but the first object is listed as ``CAPTURED_IN_OBJECT``. +Enabling LRU Fallback +~~~~~~~~~~~~~~~~~~~~~ + +By default, Ray will raise an exception if the object store is full of pinned objects when an application tries to create a new object. However, in some cases applications might keep references to objects much longer than they actually use them, so simply LRU evicting objects from the object store when it's full can prevent the application from failing. + +Please note that relying on this is **not recommended** - instead, if possible you should try to remove references when they're no longer needed in your application to free space in the object store. + +To enable LRU eviction when the object store is full, initialize Ray with the ``lru_evict`` option set: + +.. code-block:: python + + ray.init(lru_evict=True) + +.. code-block:: bash + + ray start --lru-evict + Object Spilling --------------- -.. _object-spilling: -Ray 1.3+ spills objects to external storage once the object store is full. By default, objects are spilled to the local filesystem. -To configure the directory where objects are placed, use: + +Ray 1.2.0+ has *beta* support for spilling objects to external storage once the capacity +of the object store is used up. Please file a `GitHub issue `__ +if you encounter any problems with this new feature. Eventually, object spilling will be +enabled by default, but for now you need to enable it manually: + +To enable object spilling to the local filesystem (single node clusters only): ..
code-block:: python ray.init( _system_config={ + "automatic_object_spilling_enabled": True, "object_spilling_config": json.dumps( {"type": "filesystem", "params": {"directory_path": "/tmp/spill"}}, ) @@ -200,6 +242,7 @@ To enable object spilling to remote storage (any URI supported by `smart_open `, you need to ensure that the -correct dependencies are installed at startup. Modin's repository has an -example `yaml file and set of tutorial notebooks`_ to ensure that the Ray -cluster has the correct dependencies. Once the cluster is up, connect Modin -by simply importing. - -.. code-block:: python - - import modin.pandas as pd - import ray - - ray.init(address="auto") - df = pd.read_parquet("s3://my-bucket/big.parquet") - -As long as Ray is initialized before any dataframes are created, Modin -will be able to connect to and use the Ray cluster. - -Modin with the Ray Client -------------------------- - -When using Modin with the :ref:`Ray Client `, it is important to ensure that the -cluster has all dependencies installed. - -.. code-block:: python - - import modin.pandas as pd - import ray - import ray.util - - ray.util.connect() - df = pd.read_parquet("s3://my-bucket/big.parquet") - -Modin will automatically use the Ray Client for computation when the file -is read. - -How Modin uses Ray ------------------- - -Modin has a layered architecture, and the core abstraction for data manipulation -is the Modin Dataframe, which implements a novel algebra that enables Modin to -handle all of pandas (see Modin's documentation_ for more on the architecture). -Modin's internal dataframe object has a scheduling layer that is able to partition -and operate on data with Ray. - -Dataframe operations -'''''''''''''''''''' - -The Modin Dataframe uses Ray tasks to perform data manipulations. 
Ray Tasks have -a number of benefits over the actor model for data manipulation: - -- Multiple tasks may be manipulating the same objects simultaneously -- Objects in Ray's object store are immutable, making provenance and lineage easier - to track -- As new workers come online the shuffling of data will happen as tasks are - scheduled on the new node -- Identical partitions need not be replicated, especially beneficial for operations - that selectively mutate the data (e.g. ``fillna``). -- Finer grained parallelism with finer grained placement control - -Machine Learning -'''''''''''''''' - -Modin uses Ray Actors for the machine learning support it currently provides. -Modin's implementation of XGBoost is able to spin up one actor for each node -and aggregate all of the partitions on that node to the XGBoost Actor. Modin -is able to specify precisely the node IP for each actor on creation, giving -fine-grained control over placement - a must for distributed training -performance. - -.. _Modin: https://github.com/modin-project/modin -.. _documentation: https://modin.readthedocs.io/en/latest/developer/architecture.html -.. _yaml file and set of tutorial notebooks: https://github.com/modin-project/modin/tree/master/examples/tutorial/tutorial_notebooks/cluster diff --git a/doc/source/multiprocessing.rst b/doc/source/multiprocessing.rst index 7d027b734fd9..3e3d57292b04 100644 --- a/doc/source/multiprocessing.rst +++ b/doc/source/multiprocessing.rst @@ -10,6 +10,11 @@ using `Ray Actors `__ instead of local processes. This makes it eas to scale existing applications that use ``multiprocessing.Pool`` from a single node to a cluster. +.. note:: + + This API is new and may be revised in future Ray releases. If you encounter + any bugs, please file an `issue on GitHub`_. + .. 
_`multiprocessing.Pool API`: https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool Quickstart diff --git a/doc/source/package-ref.rst b/doc/source/package-ref.rst index ebe059f972b1..db3cbd56004a 100644 --- a/doc/source/package-ref.rst +++ b/doc/source/package-ref.rst @@ -211,7 +211,6 @@ Experimental APIs .. automodule:: ray.experimental :members: -.. _ray-cli: The Ray Command Line API ------------------------ diff --git a/doc/source/placement-group.rst b/doc/source/placement-group.rst index 7db38fd84512..6fe8bc3a894d 100644 --- a/doc/source/placement-group.rst +++ b/doc/source/placement-group.rst @@ -252,77 +252,6 @@ Note that you can anytime remove the placement group to clean up resources. ray.shutdown() -Named Placement Groups ----------------------- - -A placement group can be given a globally unique name. -This allows you to retrieve the placement group from any job in the Ray cluster. -This can be useful if you cannot directly pass the placement group handle to -the actor or task that needs it, or if you are trying to -access a placement group launched by another driver. -Note that the placement group will still be destroyed if it's lifetime isn't `detached`. -See :ref:`placement-group-lifetimes` for more details. - -.. tabs:: - .. group-tab:: Python - - .. code-block:: python - - # first_driver.py - # Create a placement group with a global name. - pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="STRICT_SPREAD", lifetime="detached", name="global_name") - ray.get(pg.ready()) - - Then, we can retrieve the actor later somewhere. - - .. code-block:: python - - # second_driver.py - # Retrieve a placement group with a global name. - pg = ray.util.get_placement_group("global_name") - - .. group-tab:: Java - - The named placement group is not implemented for Java APIs yet. - -.. _placement-group-lifetimes: - -Placement Group Lifetimes -------------------------- - -.. tabs:: - .. 
group-tab:: Python - - By default, the lifetimes of placement groups are not detached and will be destroyed - when the driver is terminated (but, if it is created from a detached actor, it is - killed when the detached actor is killed). If you'd like to keep the placement group - alive regardless of its job or detached actor, you should specify - `lifetime="detached"`. For example: - - .. code-block:: python - - # first_driver.py - pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="STRICT_SPREAD", lifetime="detached") - ray.get(pg.ready()) - - The placement group's lifetime will be independent of the driver now. This means it - is possible to retrieve the placement group from other drivers regardless of when - the current driver exits. Let's see an example: - - .. code-block:: python - - # second_driver.py - table = ray.util.placement_group_table() - print(len(table)) - - Note that the lifetime option is decoupled from the name. If we only specified - the name without specifying ``lifetime="detached"``, then the placement group can - only be retrieved as long as the original driver is still running. - - .. group-tab:: Java - - The lifetime argument is not implemented for Java APIs yet. - Tips for Using Placement Groups ------------------------------- - Learn the :ref:`lifecycle ` of placement groups. diff --git a/doc/source/ray-client.rst b/doc/source/ray-client.rst index 487c24696330..a0335faaef1d 100644 --- a/doc/source/ray-client.rst +++ b/doc/source/ray-client.rst @@ -1,5 +1,3 @@ -.. _ray-client: - ********** Ray Client ********** @@ -12,13 +10,11 @@ Ray Client Basic usage =========== -The Ray client server is automatically started on port ``10001`` when you use ``ray start --head`` or Ray in an autoscaling cluster. The port can be changed by specifying --ray-client-server-port in the ``ray start`` command. - -To start the server manually, you can run: +While in beta, the server is available as an executable module. 
To start the server, run ``python -m ray.util.client.server [--host host_ip] [--port port] [--redis-address address] [--redis-password password]`` -This runs ``ray.init()`` with default options and exposes the client gRPC port at ``host_ip:port`` (by default, ``0.0.0.0:10001``). Providing ``redis-address`` and ``redis-password`` will be passed into ``ray.init()`` when the server starts, allowing connection to an existing Ray cluster, as per the `cluster setup `_ instructions. +This runs ``ray.init()`` with default options and exposes the client gRPC port at ``host_ip:port`` (by default, ``0.0.0.0:50051``). If provided, ``redis-address`` and ``redis-password`` are passed into ``ray.init()`` when the server starts, allowing connection to an existing Ray cluster, as per the `cluster setup `_ instructions. From here, another Ray script can access that server from a networked machine with ``ray.util.connect()`` @@ -27,7 +23,7 @@ From here, another Ray script can access that server from a networked machine wi import ray import ray.util - ray.util.connect(":10001") # replace with the appropriate host and port + ray.util.connect("0.0.0.0:50051") # replace with the appropriate host and port # Normal Ray code follows @ray.remote @@ -37,11 +33,12 @@ From here, another Ray script can access that server from a networked machine wi do_work.remote(2) #.... -When the client disconnects, any object or actor references held by the server on behalf of the client are dropped, as if directly disconnecting from the cluster. +When the client disconnects, any object or actor references held by the server on behalf of the client are dropped, as if directly disconnecting from the cluster. + -============ -Known issues -============ +=================== +``RAY_CLIENT_MODE`` +=================== Because Ray client mode affects the behavior of the Ray API, larger scripts or libraries imported before ``ray.util.connect()`` may not realize they're in client mode.
This feature is being tracked with `issue #13272 `_ but the workaround here is provided for beta users. @@ -52,3 +49,21 @@ Therefore, an environment variable is also available to force a Ray program into .. code-block:: bash RAY_CLIENT_MODE=1 python my_ray_program.py + + +==================================== +Programmatically creating the server +==================================== + +For larger use cases, it may be desirable to connect remote Ray clients to an existing Ray environment. The server can be started separately via: + +.. code-block:: python + + from ray.util.client.server import serve + + server = serve("0.0.0.0:50051") + # Server does some work + # ... + # Time to clean up + server.stop(0) + diff --git a/doc/source/ray-dashboard.rst b/doc/source/ray-dashboard.rst index 6c7276b2a5da..09a935fa2311 100644 --- a/doc/source/ray-dashboard.rst +++ b/doc/source/ray-dashboard.rst @@ -1,5 +1,3 @@ -.. _ray-dashboard: - Ray Dashboard ============= Ray's built-in dashboard provides metrics, charts, and other features that help diff --git a/doc/source/ray-libraries.rst b/doc/source/ray-libraries.rst index 604e680befac..3a0f2d8673c1 100644 --- a/doc/source/ray-libraries.rst +++ b/doc/source/ray-libraries.rst @@ -46,14 +46,8 @@ Hugging Face Transformers |hugging| State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0. -It integrates with Ray for distributed hyperparameter tuning of transformer models: - [`Link to integration `__] -As well as for distributed document retrieval for Retrieval Augmented Generation Models - -[`Link to integration `__] - Intel Analytics Zoo |zoo| ------------------------- @@ -88,13 +82,6 @@ PyCaret is an open source low-code machine learning library in Python that aims GitHub: `https://github.com/pycaret/pycaret `_ -PyTorch Lightning |ptl| ------------------------ - -PyTorch Lightning is a popular open-source library that provides a high level interface for PyTorch.
The goal of PyTorch Lightning is to structure your PyTorch code to abstract the details of training, making AI research scalable and fast to iterate on. - -[`Link to integration `__] - RayDP |raydp| ------------- @@ -164,10 +151,6 @@ XGBoost is a popular gradient boosting library for classification and regression :class: inline-figure :height: 30 -.. |ptl| image:: images/pytorch_lightning_small.png - :class: inline-figure - :height: 30 - .. |raydp| image:: images/intel.png :class: inline-figure :height: 30 diff --git a/doc/source/raydp.rst b/doc/source/raydp.rst deleted file mode 100644 index a0ee98282895..000000000000 --- a/doc/source/raydp.rst +++ /dev/null @@ -1,104 +0,0 @@ -******************** -RayDP (Spark on Ray) -******************** - -RayDP combines your Spark and Ray clusters, making it easy to do large scale -data processing using the PySpark API and seemlessly use that data to train -your models using TensorFlow and PyTorch. - -For more information and examples, see the RayDP Github page: -https://github.com/oap-project/raydp - -================ -Installing RayDP -================ - -RayDP can be installed from PyPI and supports PySpark 3.0 and 3.1. - -.. code-block bash - - pip install raydp - -.. note:: - RayDP requires ray >= 1.2.0 - -.. note:: - In order to run Spark, the head and worker nodes will need Java installed. - -======================== -Creating a Spark Session -======================== - -To create a spark session, call ``raydp.init_spark`` - -For example, - -.. 
code-block:: python - - import raydp - - spark = raydp.init_spark( - app_name = "example", - num_executors = 10, - executor_cores = 64, - memory_per_executor = "256GB" - ) - -==================================== -Deep Learning with a Spark DataFrame -==================================== - -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Training a Spark DataFrame with TensorFlow -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``raydp.tf.TFEstimator`` provides an API for training with TensorFlow. - -.. code-block:: python - - d = [{'age': 17 , 'grade': 12}] - df = spark.createDataFrame(d).collect() - - - from tensorflow import keras - model = keras.Sequential([]) - - estimator = raydp.tf.TFEstimator( - model = model, - num_worker = 10, - feature_columns = ["age"], - label_column = ["grade"] - ) - - estimator.fit_on_spark(df, test_df=None) - - tensorflow_model = estimator.get_model() - - -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Training a Spark DataFrame with PyTorch -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Similarly, ``raydp.torch.TorchEstimator`` provides an API for training with -PyTorch. - -.. code-block:: python - - d = [{'age': 17 , 'grade': 12}] - df = spark.createDataFrame(d).collect() - - - import torch - model = torch.nn.Sequential() - - estimator = raydp.tf.TFEstimator( - model = model, - num_worker = 10, - feature_columns = ["age"], - label_column = ["grade"] - ) - - estimator.fit_on_spark(df, test_df=None) - - pytorch_model = estimator.get_model() - diff --git a/doc/source/raysgd/raysgd.rst b/doc/source/raysgd/raysgd.rst index 85fd335f3fd8..5ab6503e44ad 100644 --- a/doc/source/raysgd/raysgd.rst +++ b/doc/source/raysgd/raysgd.rst @@ -14,6 +14,8 @@ The main features are: - **Composability**: RaySGD is built on top of the Ray Actor API, enabling seamless integration with existing Ray applications such as RLlib, Tune, and Ray.Serve. - **Scale up and down**: Start on single CPU. 
Scale up to multi-node, multi-CPU, or multi-GPU clusters by changing 2 lines of code. +.. tip:: Join our `community slack `_ to discuss Ray! + Getting Started --------------- diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index b4f42c7ceab8..8b0413273597 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -51,7 +51,7 @@ Exploration-based plug-ins (can be combined with any algo) ============================= ========== ======================= ================== =========== ===================== Algorithm Frameworks Discrete Actions Continuous Actions Multi-Agent Model Support ============================= ========== ======================= ================== =========== ===================== -`Curiosity`_ tf + torch **Yes** `+parametric`_ No **Yes** `+RNN`_ +`Curiosity`_ tf + torch **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ ============================= ========== ======================= ================== =========== ===================== .. _`A2C, A3C`: rllib-algorithms.html#a3c diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst index 0f70a536a4b4..9764644a0c46 100644 --- a/doc/source/rllib-examples.rst +++ b/doc/source/rllib-examples.rst @@ -123,5 +123,5 @@ Community Examples Example of using the multi-agent API to model several `social dilemma games `__. - `StarCraft2 `__: Example of training in StarCraft2 maps with RLlib / multi-agent. -- `Traffic Flow `__: +- `Traffic Flow `__: Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 279256de45dc..59678af7e187 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -453,7 +453,7 @@ with the remaining non-image (flat) inputs (the 1D Box and discrete/one-hot comp Take a look at this model example that does exactly that: -.. literalinclude:: ../../rllib/models/tf/complex_input_net.py +.. 
literalinclude:: ../../rllib/examples/models/cnn_plus_fc_concat_model.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 33a808a042cd..bbe35f36ea60 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -9,6 +9,8 @@ RLlib is an open-source library for reinforcement learning that offers both high To get started, take a look over the `custom env example `__ and the `API documentation `__. If you're looking to develop custom algorithms with RLlib, also check out `concepts and custom algorithms `__. +.. tip:: Join our `community slack `_ to discuss Ray/RLlib! + RLlib in 60 seconds ------------------- diff --git a/doc/source/serialization.rst b/doc/source/serialization.rst index b36d48627e8f..a5e58a339f6f 100644 --- a/doc/source/serialization.rst +++ b/doc/source/serialization.rst @@ -5,24 +5,24 @@ Serialization Since Ray processes do not share memory space, data transferred between workers and nodes will need to **serialized** and **deserialized**. Ray uses the `Plasma object store `_ to efficiently transfer objects across different processes and different nodes. Numpy arrays in the object store are shared between workers on the same node (zero-copy deserialization). -Overview --------- - -Ray has decided to use a customized `Pickle protocol version 5 `_ backport to replace the original PyArrow serializer. This gets rid of several previous limitations (e.g. cannot serialize recursive objects). - -Ray is currently compatible with Pickle protocol version 5, while Ray supports serialization of a wider range of objects (e.g. lambda & nested functions, dynamic classes) with the help of cloudpickle. - .. _plasma-store: Plasma Object Store -~~~~~~~~~~~~~~~~~~~ +------------------- Plasma is an in-memory object store that is being developed as part of Apache Arrow. Ray uses Plasma to efficiently transfer objects across different processes and different nodes. 
All objects in Plasma object store are **immutable** and held in shared memory. This is so that they can be accessed efficiently by many workers on the same node. Each node has its own object store. When data is put into the object store, it does not get automatically broadcasted to other nodes. Data remains local to the writer until requested by another task or actor on another node. +Overview +-------- + +Ray has decided to use a customized `Pickle protocol version 5 `_ backport to replace the original PyArrow serializer. This gets rid of several previous limitations (e.g. cannot serialize recursive objects). + +Ray is currently compatible with Pickle protocol version 5, while Ray supports serialization of a wider range of objects (e.g. lambda & nested functions, dynamic classes) with the help of cloudpickle. + Numpy Arrays -~~~~~~~~~~~~ +------------ Ray optimizes for numpy arrays by using Pickle protocol 5 with out-of-band data. The numpy array is stored as a read-only object, and all Ray workers on the same node can read the numpy array in the object store without copying (zero-copy reads). Each numpy array object in the worker process holds a pointer to the relevant array held in shared memory. Any writes to the read-only object will require the user to first copy it into the local process memory. @@ -48,7 +48,7 @@ Serialization notes - Lock objects are mostly unserializable, because copying a lock is meaningless and could cause serious concurrency problems. You may have to come up with a workaround if your object contains a lock. Customized Serialization ------------------------- +________________________ Sometimes you may want to customize your serialization process because the default serializer used by Ray (pickle5 + cloudpickle) does @@ -61,29 +61,29 @@ There are at least 3 ways to define your custom serialization process: function inside the corresponding class. This is commonly done by most Python libraries. Example code: - .. code-block:: python +.. 
code-block:: python - import ray - import sqlite3 + import ray + import sqlite3 - ray.init() + ray.init() - class DBConnection: - def __init__(self, path): - self.path = path - self.conn = sqlite3.connect(path) + class DBConnection: + def __init__(self, path): + self.path = path + self.conn = sqlite3.connect(path) - # without '__reduce__', the instance is unserializable. - def __reduce__(self): - deserializer = DBConnection - serialized_data = (self.path,) - return deserializer, serialized_data + # without '__reduce__', the instance is unserializable. + def __reduce__(self): + deserializer = DBConnection + serialized_data = (self.path,) + return deserializer, serialized_data - original = DBConnection("/tmp/db") - print(original.conn) + original = DBConnection("/tmp/db") + print(original.conn) - copied = ray.get(ray.put(original)) - print(copied.conn) + copied = ray.get(ray.put(original)) + print(copied.conn) 2. If you want to customize the serialization of a type of objects, but you cannot access or modify the corresponding class, you can @@ -112,17 +112,8 @@ There are at least 3 ways to define your custom serialization process: A, serializer=custom_serializer, deserializer=custom_deserializer) ray.get(ray.put(A(1))) # success! - # You can deregister the serializer at any time. - ray.util.deregister_serializer(A) - ray.get(ray.put(A(1))) # fail! - - # Nothing happens when deregister an unavailable serializer. - ray.util.deregister_serializer(A) - NOTE: Serializers are managed locally for each Ray worker. So for every Ray worker, - if you want to use the serializer, you need to register the serializer. Deregister - a serializer also only applies locally. - + if you want to use the serializer, you need to register the serializer. If you register a new serializer for a class, the new serializer would replace the old serializer immediately in the worker. This API is also idempotent, there are no side effects caused by re-registering the same serializer. 
@@ -130,29 +121,29 @@ There are at least 3 ways to define your custom serialization process: 3. We also provide you an example, if you want to customize the serialization of a specific object: - .. code-block:: python +.. code-block:: python - import threading + import threading - class A: - def __init__(self, x): - self.x = x - self.lock = threading.Lock() # could not serialize! + class A: + def __init__(self, x): + self.x = x + self.lock = threading.Lock() # could not serialize! - ray.get(ray.put(A(1))) # fail! + ray.get(ray.put(A(1))) # fail! - class SerializationHelperForA: - """A helper class for serialization.""" - def __init__(self, a): - self.a = a + class SerializationHelperForA: + """A helper class for serialization.""" + def __init__(self, a): + self.a = a - def __reduce__(self): - return A, (self.a.x,) + def __reduce__(self): + return A, (self.a.x,) - ray.get(ray.put(SerializationHelperForA(A(1)))) # success! - # the serializer only works for a specific object, not all A - # instances, so we still expect failure here. - ray.get(ray.put(A(1))) # still fail! + ray.get(ray.put(SerializationHelperForA(A(1)))) # success! + # the serializer only works for a specific object, not all A + # instances, so we still expect failure here. + ray.get(ray.put(A(1))) # still fail! Troubleshooting diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst index ca9b8e9cebf2..3ac191f1b3a4 100644 --- a/doc/source/serve/advanced.rst +++ b/doc/source/serve/advanced.rst @@ -321,10 +321,6 @@ The following metrics are exposed by Ray Serve: - The number of HTTP requests processed. * - ``serve_num_router_requests`` - The number of requests processed by the router. - * - ``serve_handle_request_counter`` - - The number of requests processed by this ServeHandle. - * - ``backend_queued_queries`` - - The number of queries for this backend waiting to be assigned to a replica. 
To see this in action, run ``ray start --head --metrics-export-port=8080`` in your terminal, and then run the following script: @@ -398,9 +394,10 @@ as shown below. The dependencies required in the backend may be different than the dependencies installed in the driver program (the one running Serve API -calls). In this case, you can pass the backend in as an import path that will -be imported in the Python environment in the workers, but not the driver. -Example: +calls). In this case, you can use an +:mod:`ImportedBackend ` to specify a +backend based on a class that is installed in the Python environment that +the workers will run in. Example: .. literalinclude:: ../../../python/ray/serve/examples/doc/imported_backend.py @@ -420,37 +417,4 @@ in :mod:`serve.start `: .. note:: Using the "EveryNode" option, you can point a cloud load balancer to the instance group of Ray cluster to achieve high availability of Serve's HTTP - proxies. - -Variable HTTP Routes -==================== - -Ray Serve supports capturing path parameters. For example, in a call of the form - -.. code-block:: python - - client.create_endpoint("my_endpoint", backend="my_backend", route="/api/{username}") - -the ``username`` parameter will be accessible in your backend code as follows: - -.. code-block:: python - - def my_backend(request): - username = request.path_params["username"] - ... - -Ray Serve uses Starlette's Router class under the hood for routing, so type -conversion for path parameters is also supported, as well as multiple path parameters. -For example, suppose this route is used: - -.. code-block:: python - - client.create_endpoint( - "complex", backend="f", route="/api/{user_id:int}/{number:float}") - -Then for a query to the route ``/api/123/3.14``, the ``request.path_params`` dictionary -available in the backend will be ``{"user_id": 123, "number": 3.14}``, where ``123`` is -a Python int and ``3.14`` is a Python float. 
- -For full details on the supported path parameters, see Starlette's -`path parameters documentation `_. + proxies. \ No newline at end of file diff --git a/doc/source/serve/deployment.rst b/doc/source/serve/deployment.rst index ed397ec83266..5ab65a7a35c1 100644 --- a/doc/source/serve/deployment.rst +++ b/doc/source/serve/deployment.rst @@ -140,7 +140,7 @@ In order to deploy Ray Serve on Kubernetes, we need to do the following: 3. Start Ray Serve on the cluster. There are multiple ways to start a Ray cluster on Kubernetes, see :ref:`ray-k8s-deploy` for more information. -Here, we will be using the :ref:`Ray Cluster Launcher ` tool, which has support for Kubernetes as a backend. +Here, we will be using the :ref:`Ray Cluster Launcher ` tool, which has support for Kubernetes as a backend. The cluster launcher takes in a yaml config file that describes the cluster. Here, we'll be using the `Kubernetes default config`_ with a few small modifications. @@ -225,7 +225,7 @@ With the cluster now running, we can run a simple script to start Ray Serve and # Connect to the running Ray cluster. ray.init(address="auto") # Bind on 0.0.0.0 to expose the HTTP server on external IPs. - client = serve.start(detached=True, http_options={"host": "0.0.0.0"}) + client = serve.start(http_options={"host": "0.0.0.0"}) def hello(): return "hello world" diff --git a/doc/source/serve/faq.rst b/doc/source/serve/faq.rst index 6faa5711266e..a9d66b610a60 100644 --- a/doc/source/serve/faq.rst +++ b/doc/source/serve/faq.rst @@ -73,20 +73,6 @@ To call a method via Python, use :mod:`handle.options `_ from your backend code: - -.. code-block:: python - - from starlette.responses import Response - - def f(starlette_request): - return Response('Hello, world!', status_code=123, media_type='text/plain') - - client.create_backend("hello", f) - How do I enable CORS and other HTTP features? --------------------------------------------- @@ -95,6 +81,14 @@ and custom middlewares in Starlette format. 
The example below shows how to enabl `Cross-Origin Resource Sharing (CORS) `_. You can follow the same pattern for other Starlette middlewares. +.. note:: + + Serve does not list ``Starlette`` as one of its dependencies. To utilize this feature, + you will need to: + + .. code-block:: bash + + pip install starlette .. code-block:: python diff --git a/doc/source/serve/index.rst b/doc/source/serve/index.rst index d5c6853dfc13..e9f76d89b7a9 100644 --- a/doc/source/serve/index.rst +++ b/doc/source/serve/index.rst @@ -1,7 +1,3 @@ -.. warning:: - Ray Serve is changing fast! You're probably running the latest pip release and not the nightly build, so please ensure you're viewing the correct version of this documentation. - `Here's the documentation for the latest pip release of Ray Serve `_. - .. _rayserve: ============================================ @@ -34,7 +30,7 @@ Ray Serve can be used in two primary ways to deploy your models at scale: .. tip:: - Chat with Ray Serve users and developers on our `forum `_! + Chat with Ray Serve users and developers on our `community Slack `_ in the #serve channel and on our `forum `_! .. note:: Starting with Ray version 1.2.0, Ray Serve backends take in a Starlette Request object instead of a Flask Request object. diff --git a/doc/source/serve/package-ref.rst b/doc/source/serve/package-ref.rst index 20ed340be1fb..3df9c291557f 100644 --- a/doc/source/serve/package-ref.rst +++ b/doc/source/serve/package-ref.rst @@ -37,3 +37,7 @@ objects instead of Starlette requests. Batching Requests ----------------- .. autofunction:: ray.serve.accept_batch + +Built-in Backends +----------------- +.. 
autoclass:: ray.serve.backends.ImportedBackend diff --git a/doc/source/starting-ray.rst b/doc/source/starting-ray.rst index b4bf4ce0206a..1791cc25b8ed 100644 --- a/doc/source/starting-ray.rst +++ b/doc/source/starting-ray.rst @@ -164,7 +164,7 @@ You can connect other nodes to the head node, creating a Ray cluster by also cal Launching a Ray cluster (``ray up``) ------------------------------------ -Ray clusters can be launched with the :ref:`Cluster Launcher `. +Ray clusters can be launched with the :ref:`Cluster Launcher `. The ``ray up`` command uses the Ray cluster launcher to start a cluster on the cloud, creating a designated "head node" and worker nodes. Underneath the hood, it automatically calls ``ray start`` to create a Ray cluster. Your code **only** needs to execute on one machine in the cluster (usually the head node). Read more about :ref:`running programs on a Ray cluster `. diff --git a/doc/source/tune/_tutorials/overview.rst b/doc/source/tune/_tutorials/overview.rst index 8e79b8ca158a..0517c2f0a9e5 100644 --- a/doc/source/tune/_tutorials/overview.rst +++ b/doc/source/tune/_tutorials/overview.rst @@ -71,9 +71,9 @@ Take a look at any of the below tutorials to get started with Tune. :description: :doc:`Track your experiment process with the Weights & Biases tools ` .. customgalleryitem:: - :tooltip: Use MLflow with Ray Tune. + :tooltip: Use MLFlow with Ray Tune. :figure: /images/mlflow.png - :description: :doc:`Log and track your hyperparameter sweep with MLflow Tracking & AutoLogging ` + :description: :doc:`Log and track your hyperparameter sweep with MLFlow Tracking & AutoLogging ` .. 
raw:: html diff --git a/doc/source/tune/_tutorials/tune-distributed.rst b/doc/source/tune/_tutorials/tune-distributed.rst index 46b47e3bc757..498576e5b1d8 100644 --- a/doc/source/tune/_tutorials/tune-distributed.rst +++ b/doc/source/tune/_tutorials/tune-distributed.rst @@ -55,7 +55,7 @@ Launching a cloud cluster If you already have a list of nodes, go to :ref:`tune-distributed-local`. -Ray currently supports AWS and GCP. Follow the instructions below to launch nodes on AWS (using the Deep Learning AMI). See the :ref:`cluster setup documentation `. Save the below cluster configuration (``tune-default.yaml``): +Ray currently supports AWS and GCP. Follow the instructions below to launch nodes on AWS (using the Deep Learning AMI). See the :ref:`cluster setup documentation `. Save the below cluster configuration (``tune-default.yaml``): .. literalinclude:: /../../python/ray/tune/examples/tune-default.yaml :language: yaml @@ -130,7 +130,7 @@ If you used a cluster configuration (starting a cluster with ``ray up`` or ``ray Syncing ------- -Tune automatically syncs the trial folder on remote nodes back to the head node. This requires the ray cluster to be started with the :ref:`cluster launcher `. +Tune automatically syncs the trial folder on remote nodes back to the head node. This requires the ray cluster to be started with the :ref:`cluster launcher `. By default, local syncing requires rsync to be installed. You can customize the sync command with the ``sync_to_driver`` argument in ``tune.SyncConfig`` by providing either a function or a string. If a string is provided, then it must include replacement fields ``{source}`` and ``{target}``, like ``rsync -savz -e "ssh -i ssh_key.pem" {source} {target}``.
Alternatively, a function can be provided with the following signature: @@ -290,7 +290,7 @@ Upon a second run, this will restore the entire experiment state from ``~/path/t Common Commands --------------- -Below are some commonly used commands for submitting experiments. Please see the :ref:`Autoscaler page ` to find more comprehensive documentation of commands. +Below are some commonly used commands for submitting experiments. Please see the :ref:`Autoscaler page ` to find more comprehensive documentation of commands. .. code-block:: bash diff --git a/doc/source/tune/api_docs/logging.rst b/doc/source/tune/api_docs/logging.rst index 1bdc400cc802..b976a898ed08 100644 --- a/doc/source/tune/api_docs/logging.rst +++ b/doc/source/tune/api_docs/logging.rst @@ -162,7 +162,7 @@ CSVLogger MLFlowLogger ------------ -Tune also provides a default logger for `MLflow `_. You can install MLflow via ``pip install mlflow``. +Tune also provides a default logger for `MLFlow `_. You can install MLFlow via ``pip install mlflow``. You can see the :doc:`tutorial here `. WandbLogger diff --git a/doc/source/tune/examples/index.rst b/doc/source/tune/examples/index.rst index acdb758929ea..27fde3a05711 100644 --- a/doc/source/tune/examples/index.rst +++ b/doc/source/tune/examples/index.rst @@ -82,13 +82,13 @@ Pytorch Lightning - :doc:`/tune/examples/mnist_pytorch_lightning`: A comprehensive example using `Pytorch Lightning `_ to train a MNIST model. This example showcases how to use various search optimization techniques. It utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks `. - :ref:`A walkthrough tutorial for using Ray Tune with Pytorch-Lightning `. -Wandb, MLflow +Wandb, MLFlow ~~~~~~~~~~~~~ - :ref:`Tutorial ` for using `wandb `__ with Ray Tune - :doc:`/tune/examples/wandb_example`: Example for using `Weights and Biases `__ with Ray Tune. -- :doc:`/tune/examples/mlflow_example`: Example for using `MLflow `__ with Ray Tune.
-- :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLflow `__ and `Pytorch Lightning `_ with Ray Tune. +- :doc:`/tune/examples/mlflow_example`: Example for using `MLFlow `__ with Ray Tune. +- :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLFlow `__ and `Pytorch Lightning `_ with Ray Tune. Tensorflow/Keras ~~~~~~~~~~~~~~~~ diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst index 59fd6ad0efaf..86f312cf8ddd 100644 --- a/doc/source/tune/index.rst +++ b/doc/source/tune/index.rst @@ -21,6 +21,9 @@ Tune is a Python library for experiment execution and hyperparameter tuning at a **Want to get started?** Head over to the :doc:`Key Concepts page `. +.. tip:: Join the `Ray community slack `_ to discuss Ray Tune (and other Ray libraries)! + + Quick Start ----------- @@ -70,7 +73,7 @@ A key problem with machine learning frameworks is the need to restructure all of With Tune, you can optimize your model just by :ref:`adding a few code snippets `. -Further, Tune actually removes boilerplate from your code training workflow, automatically :ref:`managing checkpoints ` and :ref:`logging results to tools ` such as MLflow and TensorBoard. +Further, Tune actually removes boilerplate from your code training workflow, automatically :ref:`managing checkpoints ` and :ref:`logging results to tools ` such as MLFlow and TensorBoard. Multi-GPU & distributed training out of the box diff --git a/doc/source/tune/user-guide.rst b/doc/source/tune/user-guide.rst index 8dd636042510..a830791d09fe 100644 --- a/doc/source/tune/user-guide.rst +++ b/doc/source/tune/user-guide.rst @@ -261,11 +261,10 @@ You can restore a single trial checkpoint by using ``tune.run(restore=` and also requires rsync to be installed. +On a multinode cluster, Tune automatically creates a copy of all trial checkpoints on the head node. This requires the Ray cluster to be started with the :ref:`cluster launcher ` and also requires rsync to be installed. 
Note that you must use the ``tune.checkpoint_dir`` API to trigger syncing. Also, if running Tune on Kubernetes, be sure to use the :ref:`KubernetesSyncer ` to transfer files between different pods. @@ -278,60 +277,6 @@ disable cross-node syncing: tune.run(func, sync_config=sync_config) -Stopping and resuming a tuning run ----------------------------------- -Ray Tune periodically checkpoints the experiment state so that it can be -restarted when it fails or stops. The checkpointing period is -dynamically adjusted so that at least 95% of the time is used for handling -training results and scheduling. - -If you send a SIGINT signal to the process running ``tune.run()`` (which is -usually what happens when you press Ctrl+C in the console), Ray Tune shuts -down training gracefully and saves a final experiment-level checkpoint. You -can then call ``tune.run()`` with ``resume=True`` to continue this run in -the future: - -.. code-block:: python - :emphasize-lines: 14 - - tune.run( - train, - # ... - name="my_experiment" - ) - - # This is interrupted e.g. by sending a SIGINT signal - # Next time, continue the run like so: - - tune.run( - train, - # ... - name="my_experiment", - resume=True - ) - -You will have to pass a ``name`` if you are using ``resume=True`` so that -Ray Tune can detect the experiment folder (which is usually stored at e.g. -``~/ray_results/my_experiment``). If you forgot to pass a name in the first -call, you can still pass the name when you resume the run. Please note that -in this case it is likely that your experiment name has a date suffix, so if you -ran ``tune.run(my_trainable)``, the ``name`` might look like something like this: -``my_trainable_2021-01-29_10-16-44``. - -You can see which name you need to pass by taking a look at the results table -of your original tuning run: - -.. code-block:: - :emphasize-lines: 5 - - == Status == - Memory usage on this node: 11.0/16.0 GiB - Using FIFO scheduling algorithm. 
- Resources requested: 1/16 CPUs, 0/0 GPUs, 0.0/4.69 GiB heap, 0.0/1.61 GiB objects - Result logdir: /Users/ray/ray_results/my_trainable_2021-01-29_10-16-44 - Number of trials: 1/1 (1 RUNNING) - - Handling Large Datasets ----------------------- @@ -737,10 +682,6 @@ These are the environment variables Ray Tune currently considers: or a search algorithm, Tune will error if the metric was not reported in the result. Setting this environment variable to ``1`` will disable this check. -* **TUNE_DISABLE_SIGINT_HANDLER**: Ray Tune catches SIGINT signals (e.g. sent by - Ctrl+C) to gracefully shutdown and do a final checkpoint. Setting this variable - to ``1`` will disable signal handling and stop execution right away. Defaults to - ``0``. * **TUNE_FUNCTION_THREAD_TIMEOUT_S**: Time in seconds the function API waits for threads to finish after instructing them to complete. Defaults to ``2``. * **TUNE_GLOBAL_CHECKPOINT_S**: Time in seconds that limits how often Tune's diff --git a/doc/source/walkthrough.rst b/doc/source/walkthrough.rst index ec0f0ec3a0f9..11ecb02bae75 100644 --- a/doc/source/walkthrough.rst +++ b/doc/source/walkthrough.rst @@ -92,8 +92,8 @@ Ray enables arbitrary functions to be executed asynchronously. These asynchronou @ray.remote def slow_function(): - time.sleep(10) - return 1 + time.sleep(10) + return 1 # Invocations of Ray remote functions happen in parallel. # All computation is performed in the background, driven by Ray's internal event loop. @@ -401,11 +401,21 @@ works as follows. System.out.println(waitResult.getReady()); // List of ready objects. System.out.println(waitResult.getUnready()); // list of unready objects. -Object Spilling +Object Eviction --------------- -When the object store gets full, objects will be `spilled to disk `__. -This feature is available in Ray 1.3+. +When the object store gets full, objects will be evicted to make room for new objects. +This happens in approximate LRU (least recently used) order. 
To avoid objects from +being evicted, you can call ``get`` and store their values instead. Numpy array +objects cannot be evicted while they are mapped in any Python process. + +.. note:: + + Objects created with ``put`` are pinned in memory while a Python/Java reference + to the object ref returned by the put exists. This only applies to the specific + ref returned by put, not refs in general or copies of that refs. + +See also: `object spilling `__. Remote Classes (Actors) ----------------------- diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index e00ca141c9d5..a5bcfedbf6be 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -1,6 +1,6 @@ # The base-deps Docker image installs main libraries needed to run Ray -# The GPU option is nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04 +# The GPU option is nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 ARG BASE_IMAGE="ubuntu:focal" FROM ${BASE_IMAGE} # If this arg is not "autoscaler" then no autoscaler requirements will be included @@ -30,8 +30,6 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ git \ wget \ cmake \ - g++ \ - zlib1g-dev \ $(if [ "$AUTOSCALER" = "autoscaler" ]; then echo \ tmux \ screen \ @@ -54,14 +52,12 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ numpy==1.15.4 \ psutil \ blist \ - atari-py \ # blist is needed for numpy (which is re-installed when ray is installed) - # atari-py is built from source for Python 3.8 (requires g++ & zlib1g-dev) # To avoid the following error on Jenkins: # AttributeError: 'numpy.ufunc' object has no attribute '__module__' && $HOME/anaconda3/bin/pip uninstall -y dask \ - # We install cmake temporarily to get psutil, blist & atari-py - && sudo apt-get autoremove -y cmake g++ zlib1g-dev \ + # We install cmake temporarily to get psutil + && sudo apt-get autoremove -y cmake \ # Either install kubectl or remove wget && (if [ "$AUTOSCALER" = "autoscaler" ]; \ then wget -O - -q 
https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - \ @@ -73,5 +69,3 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ fi;) \ && sudo rm -rf /var/lib/apt/lists/* \ && sudo apt-get clean - -WORKDIR $HOME \ No newline at end of file diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 2c5f37540a2c..25211085edc7 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -1,12 +1,12 @@ ARG GPU FROM rayproject/ray:nightly"$GPU" -ARG PYTHON_MINOR_VERSION=7 # We have to uninstall wrapt this way for Tensorflow compatibility COPY requirements.txt ./ COPY requirements_ml_docker.txt ./ COPY requirements_rllib.txt ./ -COPY requirements_tune.txt ./requirements_tune.txt +# Docker image uses Python 3.7 +COPY linux-py3.7-requirements_tune.txt ./requirements_tune.txt RUN sudo apt-get update \ && sudo apt-get install -y gcc \ @@ -14,13 +14,12 @@ RUN sudo apt-get update \ libgtk2.0-dev \ zlib1g-dev \ libgl1-mesa-dev \ - && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \ && $HOME/anaconda3/bin/pip --use-deprecated=legacy-resolver --no-cache-dir install -r requirements.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_rllib.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_tune.txt \ - # Remove dataclasses & typing because they are included in Python > 3.6 - && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \ - $HOME/anaconda3/bin/pip uninstall dataclasses typing -y; fi \ + && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \ + # Remove dataclasses & typing because they are included in Py3.7 + && $HOME/anaconda3/bin/pip uninstall dataclasses typing -y \ && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \ && sudo rm requirements_tune.txt && sudo rm requirements_rllib.txt \ && sudo apt-get clean diff --git a/java/api/src/main/java/io/ray/api/Ray.java 
b/java/api/src/main/java/io/ray/api/Ray.java index fb71a3bacbdf..02ffc59c85e8 100644 --- a/java/api/src/main/java/io/ray/api/Ray.java +++ b/java/api/src/main/java/io/ray/api/Ray.java @@ -51,7 +51,7 @@ public static synchronized void shutdown() { /** * Check if {@link #init} has been called yet. * - * @return True if {@link #init} has already been called and false otherwise. + *

Returns True if {@link #init} has already been called and false otherwise. */ public static boolean isInitialized() { return runtime != null; @@ -60,8 +60,8 @@ public static boolean isInitialized() { /** * Store an object in the object store. * - * @param obj The Java object to be stored. - * @return A ObjectRef instance that represents the in-store object. + * @param obj The Java object to be stored. Returns A ObjectRef instance that represents the + * in-store object. */ public static ObjectRef put(T obj) { return internal().put(obj); @@ -70,8 +70,7 @@ public static ObjectRef put(T obj) { /** * Get an object by `ObjectRef` from the object store. * - * @param objectRef The reference of the object to get. - * @return The Java object. + * @param objectRef The reference of the object to get. Returns The Java object. */ public static T get(ObjectRef objectRef) { return internal().get(objectRef); @@ -80,64 +79,46 @@ public static T get(ObjectRef objectRef) { /** * Get a list of objects by `ObjectRef`s from the object store. * - * @param objectList A list of object references. - * @return A list of Java objects. + * @param objectList A list of object references. Returns A list of Java objects. */ public static List get(List> objectList) { return internal().get(objectList); } - /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. - * - * @param waitList A list of object references to wait for. - * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. 
- * @return Two lists, one containing locally available objects, one containing the rest. - */ - public static WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { - return internal().wait(waitList, numReturns, timeoutMs, fetchLocal); - } - /** * Wait for a list of RayObjects to be locally available, until specified number of objects are * ready, or specified timeout has passed. * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, + * one containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns, int timeoutMs) { - return wait(waitList, numReturns, timeoutMs, true); + return internal().wait(waitList, numReturns, timeoutMs); } /** - * Wait for a list of RayObjects to be locally available, until specified number of objects are - * ready. + * A convenient helper method for Ray.wait. It will wait infinitely until specified number of + * objects are locally available. * * @param waitList A list of object references to wait for. - * @param numReturns The number of objects that should be returned. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param numReturns The number of objects that should be returned. Returns Two lists, one + * containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns) { - return wait(waitList, numReturns, Integer.MAX_VALUE); + return internal().wait(waitList, numReturns, Integer.MAX_VALUE); } /** - * Wait for a list of RayObjects to be locally available. 
+ * A convenient helper method for Ray.wait. It will wait infinitely until all objects are locally + * available. * - * @param waitList A list of object references to wait for. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param waitList A list of object references to wait for. Returns Two lists, one containing + * locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList) { - return wait(waitList, waitList.size()); + return internal().wait(waitList, waitList.size(), Integer.MAX_VALUE); } /** @@ -146,9 +127,8 @@ public static WaitResult wait(List> waitList) { *

Gets a handle to a named actor with the given name. The actor must have been created with * name specified. * - * @param name The name of the named actor. - * @return an ActorHandle to the actor if the actor of specified name exists or an - * Optional.empty() + * @param name The name of the named actor. Returns an ActorHandle to the actor if the actor of + * specified name exists or an Optional.empty() */ public static Optional getActor(String name) { return internal().getActor(name, false); @@ -160,9 +140,8 @@ public static Optional getActor(String name) { *

Gets a handle to a global named actor with the given name. The actor must have been created * with global name specified. * - * @param name The global name of the named actor. - * @return an ActorHandle to the actor if the actor of specified name exists or an - * Optional.empty() + * @param name The global name of the named actor. Returns an ActorHandle to the actor if the + * actor of specified name exists or an Optional.empty() */ public static Optional getGlobalActor(String name) { return internal().getActor(name, true); @@ -172,7 +151,7 @@ public static Optional getGlobalActor(String name * If users want to use Ray API in their own threads, call this method to get the async context * and then call {@link #setAsyncContext} at the beginning of the new thread. * - * @return The async context. + *

Returns The async context. */ public static Object getAsyncContext() { return internal().getAsyncContext(); @@ -196,8 +175,7 @@ public static void setAsyncContext(Object asyncContext) { * If users want to use Ray API in their own threads, they should wrap their {@link Runnable} * objects with this method. * - * @param runnable The runnable to wrap. - * @return The wrapped runnable. + * @param runnable The runnable to wrap. Returns The wrapped runnable. */ public static Runnable wrapRunnable(Runnable runnable) { return internal().wrapRunnable(runnable); @@ -207,8 +185,7 @@ public static Runnable wrapRunnable(Runnable runnable) { * If users want to use Ray API in their own threads, they should wrap their {@link Callable} * objects with this method. * - * @param callable The callable to wrap. - * @return The wrapped callable. + * @param callable The callable to wrap. Returns The wrapped callable. */ public static Callable wrapCallable(Callable callable) { return internal().wrapCallable(callable); @@ -261,8 +238,7 @@ public static RuntimeContext getRuntimeContext() { * * @param name Name of the placement group. * @param bundles Pre-allocated resource list. - * @param strategy Actor placement strategy. - * @return A handle to the created placement group. + * @param strategy Actor placement strategy. Returns A handle to the created placement group. */ public static PlacementGroup createPlacementGroup( String name, List> bundles, PlacementStrategy strategy) { @@ -289,8 +265,7 @@ public static void exitActor() { /** * Get a placement group by placement group Id. * - * @param id placement group id. - * @return The placement group. + * @param id placement group id. Returns The placement group. */ public static PlacementGroup getPlacementGroup(PlacementGroupId id) { return internal().getPlacementGroup(id); @@ -299,7 +274,7 @@ public static PlacementGroup getPlacementGroup(PlacementGroupId id) { /** * Get all placement groups in this cluster. 
* - * @return All placement groups. + *

Returns All placement groups. */ public static List getAllPlacementGroups() { return internal().getAllPlacementGroups(); diff --git a/java/api/src/main/java/io/ray/api/call/ActorCreator.java b/java/api/src/main/java/io/ray/api/call/ActorCreator.java index b64a4fbcd0e5..c6bb9cce8ea7 100644 --- a/java/api/src/main/java/io/ray/api/call/ActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/ActorCreator.java @@ -23,8 +23,7 @@ public ActorCreator(RayFuncR func, Object[] args) { * *

Note, if this is set, this actor won't share Java worker with other actors or tasks. * - * @param jvmOptions JVM options for the Java worker that this actor is running in. - * @return self + * @param jvmOptions JVM options for the Java worker that this actor is running in. Returns self * @see io.ray.api.options.ActorCreationOptions.Builder#setJvmOptions(java.lang.String) */ public ActorCreator setJvmOptions(String jvmOptions) { @@ -35,7 +34,7 @@ public ActorCreator setJvmOptions(String jvmOptions) { /** * Create a java actor remotely and return a handle to the created actor. * - * @return a handle to the created java actor. + *

Returns a handle to the created java actor. */ public ActorHandle remote() { return Ray.internal().createActor(func, args, buildOptions()); diff --git a/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java b/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java index 4579acbb876d..4b9d25a21478 100644 --- a/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java @@ -25,7 +25,7 @@ public ActorTaskCaller(ActorHandle actor, RayFuncR func, Object[] args) { * Execute an java actor method remotely and return an object reference to the result object in * the object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java b/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java index 7e761b4c2859..5f488124b16c 100644 --- a/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java @@ -18,8 +18,7 @@ public class BaseActorCreator { * name via {@link Ray#getActor(java.lang.String)}. If you want create a named actor that is * accessible from all jobs, use {@link BaseActorCreator#setGlobalName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self * @see io.ray.api.options.ActorCreationOptions.Builder#setName(String) */ public T setName(String name) { @@ -32,8 +31,7 @@ public T setName(String name) { * Ray#getGlobalActor(java.lang.String)}. If you want to create a named actor that is only * accessible from this job, use {@link BaseActorCreator#setName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self * @see io.ray.api.options.ActorCreationOptions.Builder#setGlobalName(String) */ public T setGlobalName(String name) { @@ -47,8 +45,7 @@ public T setGlobalName(String name) { * used. * * @param resourceName resource name - * @param resourceQuantity resource quantity - * @return self + * @param resourceQuantity resource quantity Returns self * @see ActorCreationOptions.Builder#setResource(java.lang.String, java.lang.Double) */ public T setResource(String resourceName, Double resourceQuantity) { @@ -61,8 +58,7 @@ public T setResource(String resourceName, Double resourceQuantity) { * called multiple times. If the same resource is set multiple times, the latest quantity will be * used. 
* - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. Returns self * @see BaseActorCreator#setResources(java.util.Map) */ public T setResources(Map resources) { @@ -75,8 +71,7 @@ public T setResources(Map resources) { * unexpectedly. The minimum valid value is 0 (default), which indicates that the actor doesn't * need to be restarted. A value of -1 indicates that an actor should be restarted indefinitely. * - * @param maxRestarts max number of actor restarts - * @return self + * @param maxRestarts max number of actor restarts Returns self * @see ActorCreationOptions.Builder#setMaxRestarts(int) */ public T setMaxRestarts(int maxRestarts) { @@ -90,8 +85,7 @@ public T setMaxRestarts(int maxRestarts) { *

The max concurrency defaults to 1 for threaded execution. Note that the execution order is * not guaranteed when {@code max_concurrency > 1}. * - * @param maxConcurrency The max number of concurrent calls to allow for this actor. - * @return self + * @param maxConcurrency The max number of concurrent calls to allow for this actor. Returns self * @see ActorCreationOptions.Builder#setMaxConcurrency(int) */ public T setMaxConcurrency(int maxConcurrency) { @@ -103,8 +97,7 @@ public T setMaxConcurrency(int maxConcurrency) { * Set the placement group to place this actor in. * * @param group The placement group of the actor. - * @param bundleIndex The index of the bundle to place this actor in. - * @return self + * @param bundleIndex The index of the bundle to place this actor in. Returns self * @see ActorCreationOptions.Builder#setPlacementGroup(PlacementGroup, int) */ public T setPlacementGroup(PlacementGroup group, int bundleIndex) { diff --git a/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java b/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java index 88c58e05350f..8b683c7bdf55 100644 --- a/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java @@ -14,8 +14,7 @@ public class BaseTaskCaller> { /** * Set a name for this task. * - * @param name task name - * @return self + * @param name task name Returns self * @see CallOptions.Builder#setName(java.lang.String) */ public T setName(String name) { @@ -28,8 +27,7 @@ public T setName(String name) { * times. If the same resource is set multiple times, the latest quantity will be used. 
* * @param name resource name - * @param value resource capacity - * @return self + * @param value resource capacity Returns self * @see CallOptions.Builder#setResource(java.lang.String, java.lang.Double) */ public T setResource(String name, Double value) { @@ -41,8 +39,7 @@ public T setResource(String name, Double value) { * Set custom requirements for multiple resources. This method can be called multiple times. If * the same resource is set multiple times, the latest quantity will be used. * - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. Returns self * @see CallOptions.Builder#setResources(java.util.Map) */ public T setResources(Map resources) { diff --git a/java/api/src/main/java/io/ray/api/call/PyActorCreator.java b/java/api/src/main/java/io/ray/api/call/PyActorCreator.java index fb87a1eac7da..5add65346c73 100644 --- a/java/api/src/main/java/io/ray/api/call/PyActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/PyActorCreator.java @@ -17,7 +17,7 @@ public PyActorCreator(PyActorClass pyActorClass, Object[] args) { /** * Create a python actor remotely and return a handle to the created actor. * - * @return a handle to the created python actor. + *

Returns a handle to the created python actor. */ public PyActorHandle remote() { return Ray.internal().createActor(pyActorClass, args, buildOptions()); diff --git a/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java b/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java index 7ee7d8a13c92..c9444548f407 100644 --- a/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java @@ -25,7 +25,7 @@ public PyActorTaskCaller(PyActorHandle actor, PyActorMethod method, Object[] * Execute a python actor method remotely and return an object reference to the result object in * the object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java b/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java index ecd7aa3c8987..8d58e9b300a8 100644 --- a/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java @@ -22,7 +22,7 @@ public PyTaskCaller(PyFunction func, Object[] args) { * Execute a python function remotely and return an object reference to the result object in the * object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/TaskCaller.java b/java/api/src/main/java/io/ray/api/call/TaskCaller.java index 80dacec2dfdc..82f72d63e6cd 100644 --- a/java/api/src/main/java/io/ray/api/call/TaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/TaskCaller.java @@ -22,7 +22,7 @@ public TaskCaller(RayFuncR func, Object[] args) { * Execute a java function remotely and return an object reference to the result object in the * object store. * - * @return an object reference to an object in the object store. + *

Returns an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/function/PyActorClass.java b/java/api/src/main/java/io/ray/api/function/PyActorClass.java index d76385919b9b..c753e1f27b72 100644 --- a/java/api/src/main/java/io/ray/api/function/PyActorClass.java +++ b/java/api/src/main/java/io/ray/api/function/PyActorClass.java @@ -38,8 +38,7 @@ private PyActorClass(String moduleName, String className) { * Create a python actor class. * * @param moduleName The full module name of this actor class - * @param className The name of this actor class - * @return a python actor class + * @param className The name of this actor class Returns a python actor class */ public static PyActorClass of(String moduleName, String className) { return new PyActorClass(moduleName, className); diff --git a/java/api/src/main/java/io/ray/api/function/PyActorMethod.java b/java/api/src/main/java/io/ray/api/function/PyActorMethod.java index 6f24b5d11a3c..f91b0c9f9c10 100644 --- a/java/api/src/main/java/io/ray/api/function/PyActorMethod.java +++ b/java/api/src/main/java/io/ray/api/function/PyActorMethod.java @@ -43,8 +43,7 @@ private PyActorMethod(String methodName, Class returnType) { /** * Create a python actor method. * - * @param methodName The name of this actor method - * @return a python actor method. + * @param methodName The name of this actor method Returns a python actor method. */ public static PyActorMethod of(String methodName) { return of(methodName, Object.class); @@ -55,8 +54,7 @@ public static PyActorMethod of(String methodName) { * * @param methodName The name of this actor method * @param returnType Class of the return value of this actor method - * @param The type of the return value of this actor method - * @return a python actor method. + * @param The type of the return value of this actor method Returns a python actor method. 
*/ public static PyActorMethod of(String methodName, Class returnType) { return new PyActorMethod<>(methodName, returnType); diff --git a/java/api/src/main/java/io/ray/api/function/PyFunction.java b/java/api/src/main/java/io/ray/api/function/PyFunction.java index 2119b0bbf310..119bba4e5be2 100644 --- a/java/api/src/main/java/io/ray/api/function/PyFunction.java +++ b/java/api/src/main/java/io/ray/api/function/PyFunction.java @@ -49,8 +49,7 @@ private PyFunction(String moduleName, String functionName, Class returnType) * Create a python function. * * @param moduleName The full module name of this function - * @param functionName The name of this function - * @return a python function. + * @param functionName The name of this function Returns a python function. */ public static PyFunction of(String moduleName, String functionName) { return of(moduleName, functionName, Object.class); @@ -62,8 +61,7 @@ public static PyFunction of(String moduleName, String functionName) { * @param moduleName The full module name of this function * @param functionName The name of this function * @param returnType Class of the return value of this function - * @param Type of the return value of this function - * @return a python function. + * @param Type of the return value of this function Returns a python function. */ public static PyFunction of(String moduleName, String functionName, Class returnType) { return new PyFunction<>(moduleName, functionName, returnType); diff --git a/java/api/src/main/java/io/ray/api/id/BaseId.java b/java/api/src/main/java/io/ray/api/id/BaseId.java index ee91a77d63c4..573f549b2fa3 100644 --- a/java/api/src/main/java/io/ray/api/id/BaseId.java +++ b/java/api/src/main/java/io/ray/api/id/BaseId.java @@ -52,7 +52,7 @@ public boolean isNil() { /** * Derived class should implement this function. * - * @return The length of this id in bytes. + *

Returns The length of this id in bytes. */ public abstract int size(); diff --git a/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java b/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java index 303239735586..29a13c115052 100644 --- a/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java +++ b/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java @@ -50,8 +50,7 @@ public static class Builder { * this name via {@link Ray#getActor(java.lang.String)}. If you want create a named actor that * is accessible from all jobs, use {@link Builder#setGlobalName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self */ public Builder setName(String name) { this.name = name; @@ -64,8 +63,7 @@ public Builder setName(String name) { * {@link Ray#getGlobalActor(java.lang.String)}. If you want to create a named actor that is * only accessible from this job, use {@link Builder#setName(java.lang.String)} instead. * - * @param name The name of the named actor. - * @return self + * @param name The name of the named actor. Returns self */ public Builder setGlobalName(String name) { this.name = name; @@ -79,8 +77,7 @@ public Builder setGlobalName(String name) { * will be used. * * @param resourceName resource name - * @param resourceQuantity resource quantity - * @return self + * @param resourceQuantity resource quantity Returns self */ public Builder setResource(String resourceName, Double resourceQuantity) { this.resources.put(resourceName, resourceQuantity); @@ -92,8 +89,7 @@ public Builder setResource(String resourceName, Double resourceQuantity) { * be called multiple times. If the same resource is set multiple times, the latest quantity * will be used. * - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. 
Returns self */ public Builder setResources(Map resources) { this.resources.putAll(resources); @@ -105,8 +101,7 @@ public Builder setResources(Map resources) { * unexpectedly. The minimum valid value is 0 (default), which indicates that the actor doesn't * need to be restarted. A value of -1 indicates that an actor should be restarted indefinitely. * - * @param maxRestarts max number of actor restarts - * @return self + * @param maxRestarts max number of actor restarts Returns self */ public Builder setMaxRestarts(int maxRestarts) { this.maxRestarts = maxRestarts; @@ -118,8 +113,7 @@ public Builder setMaxRestarts(int maxRestarts) { * *

Note, if this is set, this actor won't share Java worker with other actors or tasks. * - * @param jvmOptions JVM options for the Java worker that this actor is running in. - * @return self + * @param jvmOptions JVM options for the Java worker that this actor is running in. Returns self */ public Builder setJvmOptions(String jvmOptions) { this.jvmOptions = jvmOptions; @@ -132,8 +126,8 @@ public Builder setJvmOptions(String jvmOptions) { *

The max concurrency defaults to 1 for threaded execution. Note that the execution order is * not guaranteed when {@code max_concurrency > 1}. * - * @param maxConcurrency The max number of concurrent calls to allow for this actor. - * @return self + * @param maxConcurrency The max number of concurrent calls to allow for this actor. Returns + * self */ public Builder setMaxConcurrency(int maxConcurrency) { if (maxConcurrency <= 0) { @@ -148,8 +142,7 @@ public Builder setMaxConcurrency(int maxConcurrency) { * Set the placement group to place this actor in. * * @param group The placement group of the actor. - * @param bundleIndex The index of the bundle to place this actor in. - * @return self + * @param bundleIndex The index of the bundle to place this actor in. Returns self */ public Builder setPlacementGroup(PlacementGroup group, int bundleIndex) { this.group = group; diff --git a/java/api/src/main/java/io/ray/api/options/CallOptions.java b/java/api/src/main/java/io/ray/api/options/CallOptions.java index 37e474d55a33..233c30aa3fe2 100644 --- a/java/api/src/main/java/io/ray/api/options/CallOptions.java +++ b/java/api/src/main/java/io/ray/api/options/CallOptions.java @@ -22,8 +22,7 @@ public static class Builder { /** * Set a name for this task. * - * @param name task name - * @return self + * @param name task name Returns self */ public Builder setName(String name) { this.name = name; @@ -35,8 +34,7 @@ public Builder setName(String name) { * multiple times. If the same resource is set multiple times, the latest quantity will be used. * * @param name resource name - * @param value resource capacity - * @return self + * @param value resource capacity Returns self */ public Builder setResource(String name, Double value) { this.resources.put(name, value); @@ -47,8 +45,7 @@ public Builder setResource(String name, Double value) { * Set custom requirements for multiple resources. This method can be called multiple times. 
If * the same resource is set multiple times, the latest quantity will be used. * - * @param resources requirements for multiple resources. - * @return self + * @param resources requirements for multiple resources. Returns self */ public Builder setResources(Map resources) { this.resources.putAll(resources); diff --git a/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java b/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java index 0c5b31b67889..9b4080deb988 100644 --- a/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java +++ b/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java @@ -1,57 +1,9 @@ package io.ray.api.placementgroup; -import io.ray.api.id.PlacementGroupId; -import java.util.List; -import java.util.Map; - /** * A placement group is used to place interdependent actors according to a specific strategy {@link * PlacementStrategy}. When a placement group is created, the corresponding actor slots and * resources are preallocated. A placement group consists of one or more bundles plus a specific * placement strategy. */ -public interface PlacementGroup { - - /** - * Get the id of current placement group. - * - * @return Id of current placement group. - */ - PlacementGroupId getId(); - - /** - * Get the name of current placement group. - * - * @return Name of current placement group. - */ - String getName(); - - /** - * Get all bundles which key is resource name and value is resource value. - * - * @return All bundles of current placement group. - */ - List> getBundles(); - - /** - * Get the strategy of current placement group. - * - * @return Strategy of current placement group. - */ - PlacementStrategy getStrategy(); - - /** - * Get the state of current placement group. - * - * @return Creation state of current placement group. - */ - PlacementGroupState getState(); - - /** - * Wait for the placement group to be ready within the specified time. 
- * - * @param timeoutSeconds Timeout in seconds. - * @return True if the placement group is created. False otherwise. - */ - boolean wait(int timeoutSeconds); -} +public interface PlacementGroup {} diff --git a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java index ac5f44f3f139..2f3eeb2a7160 100644 --- a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java +++ b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java @@ -31,42 +31,35 @@ public interface RayRuntime { /** * Store an object in the object store. * - * @param obj The Java object to be stored. - * @return A ObjectRef instance that represents the in-store object. + * @param obj The Java object to be stored. Returns A ObjectRef instance that represents the + * in-store object. */ ObjectRef put(T obj); /** * Get an object from the object store. * - * @param objectRef The reference of the object to get. - * @return The Java object. + * @param objectRef The reference of the object to get. Returns The Java object. */ T get(ObjectRef objectRef); /** * Get a list of objects from the object store. * - * @param objectRefs The list of object references. - * @return A list of Java objects. + * @param objectRefs The list of object references. Returns A list of Java objects. */ List get(List> objectRefs); /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. + * Wait for a list of RayObjects to be locally available, until specified number of objects are + * ready, or specified timeout has passed. * * @param waitList A list of ObjectRef to wait for. * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. 
If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, + * one containing locally available objects, one containing the rest. */ - WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal); + WaitResult wait(List> waitList, int numReturns, int timeoutMs); /** * Free a list of objects from Plasma Store. @@ -94,8 +87,7 @@ WaitResult wait( * name specified. * * @param name The name of the named actor. - * @param global Whether the named actor is global. - * @return ActorHandle to the actor. + * @param global Whether the named actor is global. Returns ActorHandle to the actor. */ Optional getActor(String name, boolean global); @@ -112,8 +104,7 @@ WaitResult wait( * * @param func The remote function to run. * @param args The arguments of the remote function. - * @param options The options for this call. - * @return The result object. + * @param options The options for this call. Returns The result object. */ ObjectRef call(RayFunc func, Object[] args, CallOptions options); @@ -122,8 +113,7 @@ WaitResult wait( * * @param pyFunction The Python function. * @param args Arguments of the function. - * @param options The options for this call. - * @return The result object. + * @param options The options for this call. Returns The result object. */ ObjectRef call(PyFunction pyFunction, Object[] args, CallOptions options); @@ -132,8 +122,7 @@ WaitResult wait( * * @param actor A handle to the actor. * @param func The remote function to run, it must be a method of the given actor. - * @param args The arguments of the remote function. - * @return The result object. + * @param args The arguments of the remote function. Returns The result object. 
*/ ObjectRef callActor(ActorHandle actor, RayFunc func, Object[] args); @@ -142,8 +131,7 @@ WaitResult wait( * * @param pyActor A handle to the actor. * @param pyActorMethod The actor method. - * @param args Arguments of the function. - * @return The result object. + * @param args Arguments of the function. Returns The result object. */ ObjectRef callActor(PyActorHandle pyActor, PyActorMethod pyActorMethod, Object[] args); @@ -153,8 +141,7 @@ WaitResult wait( * @param actorFactoryFunc A remote function whose return value is the actor object. * @param args The arguments for the remote function. * @param The type of the actor object. - * @param options The options for creating actor. - * @return A handle to the actor. + * @param options The options for creating actor. Returns A handle to the actor. */ ActorHandle createActor( RayFunc actorFactoryFunc, Object[] args, ActorCreationOptions options); @@ -164,8 +151,7 @@ ActorHandle createActor( * * @param pyActorClass The Python actor class. * @param args Arguments of the actor constructor. - * @param options The options for creating actor. - * @return A handle to the actor. + * @param options The options for creating actor. Returns A handle to the actor. */ PyActorHandle createActor(PyActorClass pyActorClass, Object[] args, ActorCreationOptions options); @@ -184,16 +170,14 @@ PlacementGroup createPlacementGroup( /** * Wrap a {@link Runnable} with necessary context capture. * - * @param runnable The runnable to wrap. - * @return The wrapped runnable. + * @param runnable The runnable to wrap. Returns The wrapped runnable. */ Runnable wrapRunnable(Runnable runnable); /** * Wrap a {@link Callable} with necessary context capture. * - * @param callable The callable to wrap. - * @return The wrapped callable. + * @param callable The callable to wrap. Returns The wrapped callable. */ Callable wrapCallable(Callable callable); @@ -203,15 +187,14 @@ PlacementGroup createPlacementGroup( /** * Get a placement group by id. 
* - * @param id placement group id. - * @return The placement group. + * @param id placement group id. Returns The placement group. */ PlacementGroup getPlacementGroup(PlacementGroupId id); /** * Get all placement groups in this cluster. * - * @return All placement groups. + *

Returns All placement groups. */ List getAllPlacementGroups(); @@ -226,8 +209,8 @@ PlacementGroup createPlacementGroup( * Wait for the placement group to be ready within the specified time. * * @param id Id of placement group. - * @param timeoutMs Timeout in milliseconds. - * @return True if the placement group is created. False otherwise. + * @param timeoutMs Timeout in milliseconds. Returns True if the placement group is created. False + * otherwise. */ boolean waitPlacementGroupReady(PlacementGroupId id, int timeoutMs); } diff --git a/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java b/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java index d00ea4f1195b..b5fa486aa586 100644 --- a/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java +++ b/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java @@ -21,7 +21,7 @@ public interface RuntimeContext { boolean wasCurrentActorRestarted(); /** - * Returns true if Ray is running in single-process mode, false if Ray is running in cluster mode. + * Return true if Ray is running in single-process mode, false if Ray is running in cluster mode. 
*/ boolean isSingleProcess(); diff --git a/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java b/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java index 15d9e9d76a53..f3478e4c6c68 100644 --- a/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java +++ b/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java @@ -105,9 +105,8 @@ public void free(List> objectRefs, boolean localOnly) { } @Override - public WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { - return objectStore.wait(waitList, numReturns, timeoutMs, fetchLocal); + public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { + return objectStore.wait(waitList, numReturns, timeoutMs); } @Override diff --git a/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java b/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java index 85a46ad8b963..1dd4b84f5c2b 100644 --- a/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java +++ b/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java @@ -71,7 +71,7 @@ public void readExternal(ObjectInput in) throws IOException, ClassNotFoundExcept /** * Serialize this actor handle to bytes. * - * @return the bytes of the actor handle + *

Returns the bytes of the actor handle */ public byte[] toBytes() { return nativeSerialize(actorId); @@ -80,7 +80,7 @@ public byte[] toBytes() { /** * Deserialize an actor handle from bytes. * - * @return the bytes of an actor handle + *

Returns the bytes of an actor handle */ public static NativeActorHandle fromBytes(byte[] bytes) { byte[] actorId = nativeDeserialize(bytes); diff --git a/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java b/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java index c9ef7ce3bbe6..d26a13dca193 100644 --- a/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java +++ b/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java @@ -69,8 +69,7 @@ public FunctionManager(List codeSearchPath) { * Get the RayFunction from a RayFunc instance (a lambda). * * @param jobId current job id. - * @param func The lambda. - * @return A RayFunction object. + * @param func The lambda. Returns A RayFunction object. */ public RayFunction getFunction(JobId jobId, RayFunc func) { JavaFunctionDescriptor functionDescriptor = RAY_FUNC_CACHE.get().get(func.getClass()); @@ -91,8 +90,7 @@ public RayFunction getFunction(JobId jobId, RayFunc func) { * Get the RayFunction from a function descriptor. * * @param jobId Current job id. - * @param functionDescriptor The function descriptor. - * @return A RayFunction object. + * @param functionDescriptor The function descriptor. Returns A RayFunction object. */ public RayFunction getFunction(JobId jobId, JavaFunctionDescriptor functionDescriptor) { JobFunctionTable jobFunctionTable = jobFunctionTables.get(jobId); diff --git a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java index cc70bbd7e963..df34212e7eec 100644 --- a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java +++ b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java @@ -35,8 +35,7 @@ public GcsClient(String redisAddress, String redisPassword) { /** * Get placement group by {@link PlacementGroupId}. * - * @param placementGroupId Id of placement group. - * @return The placement group. 
+ * @param placementGroupId Id of placement group. Returns The placement group. */ public PlacementGroup getPlacementGroupInfo(PlacementGroupId placementGroupId) { byte[] result = globalStateAccessor.getPlacementGroupInfo(placementGroupId); @@ -46,7 +45,7 @@ public PlacementGroup getPlacementGroupInfo(PlacementGroupId placementGroupId) { /** * Get all placement groups in this cluster. * - * @return All placement groups. + *

Returns All placement groups. */ public List getAllPlacementGroupInfo() { List results = globalStateAccessor.getAllPlacementGroupInfo(); diff --git a/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java b/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java index 811402994e4e..77004a8493a4 100644 --- a/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java +++ b/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java @@ -88,7 +88,7 @@ public byte[] get(byte[] key, byte[] field) { /** * Return the specified elements of the list stored at the specified key. * - * @return Multi bulk reply, specifically a list of elements in the specified range. + *

Returns Multi bulk reply, specifically a list of elements in the specified range. */ public List lrange(byte[] key, long start, long end) { try (Jedis jedis = jedisPool.getResource()) { diff --git a/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java b/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java index 80c39cf96f50..961cbfe9a9b8 100644 --- a/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java +++ b/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java @@ -54,7 +54,7 @@ public void record() { /** * Get the value to record and then reset. * - * @return latest updating value. + *

Returns latest updating value. */ protected abstract double getAndReset(); diff --git a/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java b/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java index f3af834f6715..85939ed79abb 100644 --- a/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java +++ b/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java @@ -111,7 +111,7 @@ public B tags(Map tags) { /** * Creates a metric by sub-class. * - * @return a metric + *

Returns a metric */ protected abstract M create(); diff --git a/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java index cb5752d00a81..e1bfc64faa62 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java @@ -60,8 +60,7 @@ public List getRaw(List objectIds, long timeoutMs) { } @Override - public List wait( - List objectIds, int numObjects, long timeoutMs, boolean fetchLocal) { + public List wait(List objectIds, int numObjects, long timeoutMs) { waitInternal(objectIds, numObjects, timeoutMs); return objectIds.stream().map(pool::containsKey).collect(Collectors.toList()); } diff --git a/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java index c68709e10e68..24dd5b8a2699 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java @@ -45,9 +45,8 @@ public List getRaw(List objectIds, long timeoutMs) { } @Override - public List wait( - List objectIds, int numObjects, long timeoutMs, boolean fetchLocal) { - return nativeWait(toBinaryList(objectIds), numObjects, timeoutMs, fetchLocal); + public List wait(List objectIds, int numObjects, long timeoutMs) { + return nativeWait(toBinaryList(objectIds), numObjects, timeoutMs); } @Override @@ -114,7 +113,7 @@ private static List toBinaryList(List ids) { private static native List nativeGet(List ids, long timeoutMs); private static native List nativeWait( - List objectIds, int numObjects, long timeoutMs, boolean fetchLocal); + List objectIds, int numObjects, long timeoutMs); private static native void nativeDelete(List objectIds, boolean localOnly); diff --git 
a/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java index 51ae9bfd2b98..76576b969e20 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java @@ -55,8 +55,7 @@ public class ObjectSerializer { * Deserialize an object from an {@link NativeRayObject} instance. * * @param nativeRayObject The object to deserialize. - * @param objectId The associated object ID of the object. - * @return The deserialized object. + * @param objectId The associated object ID of the object. Returns The deserialized object. */ public static Object deserialize( NativeRayObject nativeRayObject, ObjectId objectId, Class objectType) { @@ -111,8 +110,7 @@ public static Object deserialize( /** * Serialize an Java object to an {@link NativeRayObject} instance. * - * @param object The object to serialize. - * @return The serialized object. + * @param object The object to serialize. Returns The serialized object. */ public static NativeRayObject serialize(Object object) { if (object instanceof NativeRayObject) { diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java index 5e7b626033a2..df524af11c8a 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java @@ -26,8 +26,7 @@ public ObjectStore(WorkerContext workerContext) { /** * Put a raw object into object store. * - * @param obj The ray object. - * @return Generated ID of the object. + * @param obj The ray object. Returns Generated ID of the object. */ public abstract ObjectId putRaw(NativeRayObject obj); @@ -42,8 +41,7 @@ public ObjectStore(WorkerContext workerContext) { /** * Serialize and put an object to the object store. * - * @param object The object to put. 
- * @return Id of the object. + * @param object The object to put. Returns Id of the object. */ public ObjectId put(Object object) { if (object instanceof NativeRayObject) { @@ -73,8 +71,8 @@ public void put(Object object, ObjectId objectId) { * Get a list of raw objects from the object store. * * @param objectIds IDs of the objects to get. - * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. - * @return Result list of objects data. + * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. Returns Result list + * of objects data. */ public abstract List getRaw(List objectIds, long timeoutMs); @@ -82,8 +80,7 @@ public void put(Object object, ObjectId objectId) { * Get a list of objects from the object store. * * @param ids List of the object ids. - * @param Type of these objects. - * @return A list of GetResult objects. + * @param Type of these objects. Returns A list of GetResult objects. */ @SuppressWarnings("unchecked") public List get(List ids, Class elementType) { @@ -117,36 +114,25 @@ public List get(List ids, Class elementType) { } /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. + * Wait for a list of objects to appear in the object store. * * @param objectIds IDs of the objects to wait for. * @param numObjects Number of objects that should appear. - * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. - * @return A bitset that indicates each object has appeared or not. + * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. Returns A bitset + * that indicates each object has appeared or not. 
*/ - public abstract List wait( - List objectIds, int numObjects, long timeoutMs, boolean fetchLocal); + public abstract List wait(List objectIds, int numObjects, long timeoutMs); /** - * Wait for a list of RayObjects to be available, until specified number of objects are ready, or - * specified timeout has passed. + * Wait for a list of RayObjects to be locally available, until specified number of objects are + * ready, or specified timeout has passed. * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. - * @param fetchLocal If true, wait for the object to be downloaded onto the local node before - * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the - * local node and will return immediately once the object is available anywhere in the - * cluster. - * @return Two lists, one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, + * one containing locally available objects, one containing the rest. */ - public WaitResult wait( - List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { + public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { Preconditions.checkNotNull(waitList); if (waitList.isEmpty()) { return new WaitResult<>(Collections.emptyList(), Collections.emptyList()); @@ -155,7 +141,7 @@ public WaitResult wait( List ids = waitList.stream().map(ref -> ((ObjectRefImpl) ref).getId()).collect(Collectors.toList()); - List ready = wait(ids, numReturns, timeoutMs, fetchLocal); + List ready = wait(ids, numReturns, timeoutMs); List> readyList = new ArrayList<>(); List> unreadyList = new ArrayList<>(); @@ -199,8 +185,7 @@ public WaitResult wait( /** * Promote the given object to the underlying object store, and get the ownership info. 
* - * @param objectId The ID of the object to promote - * @return the serialized ownership address + * @param objectId The ID of the object to promote Returns the serialized ownership address */ public abstract byte[] promoteAndGetOwnershipInfo(ObjectId objectId); diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java index 55ca446f8423..b08f7c9f5c0f 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java @@ -30,32 +30,32 @@ private PlacementGroupImpl( this.state = state; } - @Override public PlacementGroupId getId() { return id; } - @Override public String getName() { return name; } - @Override public List> getBundles() { return bundles; } - @Override public PlacementStrategy getStrategy() { return strategy; } - @Override public PlacementGroupState getState() { return state; } - @Override + /** + * Wait for the placement group to be ready within the specified time. + * + * @param timeoutSeconds Timeout in seconds. Returns True if the placement group is created. False + * otherwise. + */ public boolean wait(int timeoutSeconds) { return Ray.internal().waitPlacementGroupReady(id, timeoutSeconds); } @@ -71,8 +71,7 @@ public static class Builder { /** * Set the Id of the placement group. * - * @param id Id of the placement group. - * @return self. + * @param id Id of the placement group. Returns self. */ public Builder setId(PlacementGroupId id) { this.id = id; @@ -82,8 +81,7 @@ public Builder setId(PlacementGroupId id) { /** * Set the name of the placement group. * - * @param name Name of the placement group. - * @return self. + * @param name Name of the placement group. Returns self. 
*/ public Builder setName(String name) { this.name = name; @@ -93,8 +91,7 @@ public Builder setName(String name) { /** * Set the bundles of the placement group. * - * @param bundles the bundles of the placement group. - * @return self. + * @param bundles the bundles of the placement group. Returns self. */ public Builder setBundles(List> bundles) { this.bundles = bundles; @@ -104,8 +101,7 @@ public Builder setBundles(List> bundles) { /** * Set the placement strategy of the placement group. * - * @param strategy the placement strategy of the placement group. - * @return self. + * @param strategy the placement strategy of the placement group. Returns self. */ public Builder setStrategy(PlacementStrategy strategy) { this.strategy = strategy; @@ -115,8 +111,7 @@ public Builder setStrategy(PlacementStrategy strategy) { /** * Set the placement state of the placement group. * - * @param state the state of the placement group. - * @return self. + * @param state the state of the placement group. Returns self. */ public Builder setState(PlacementGroupState state) { this.state = state; diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java index 8e9d03cc6407..75305ef1f4e2 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java @@ -61,8 +61,8 @@ private static PlacementGroupState covertToUserSpecifiedState( /** * Generate a PlacementGroupImpl from placementGroupTableData protobuf data. * - * @param placementGroupTableData protobuf data. - * @return placement group info {@link PlacementGroupImpl} + * @param placementGroupTableData protobuf data. 
Returns placement group info {@link + * PlacementGroupImpl} */ private static PlacementGroupImpl generatePlacementGroupFromPbData( PlacementGroupTableData placementGroupTableData) { @@ -90,8 +90,8 @@ private static PlacementGroupImpl generatePlacementGroupFromPbData( /** * Generate a PlacementGroupImpl from byte array. * - * @param placementGroupByteArray bytes array from native method. - * @return placement group info {@link PlacementGroupImpl} + * @param placementGroupByteArray bytes array from native method. Returns placement group info + * {@link PlacementGroupImpl} */ public static PlacementGroupImpl generatePlacementGroupFromByteArray( byte[] placementGroupByteArray) { diff --git a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java index 192e5550ceb4..2307b0489d3c 100644 --- a/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java +++ b/java/runtime/src/main/java/io/ray/runtime/runner/RunManager.java @@ -96,7 +96,7 @@ public static void getAddressInfoAndFillConfig(RayConfig rayConfig) { * * @param command The command to start the process with. */ - public static String runCommand(List command) throws IOException, InterruptedException { + private static String runCommand(List command) throws IOException, InterruptedException { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Starting process with command: {}", Joiner.on(" ").join(command)); } diff --git a/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java b/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java index e8a8351716d5..ca195d6ced11 100644 --- a/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java +++ b/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java @@ -21,8 +21,7 @@ public interface TaskSubmitter { * @param functionDescriptor The remote function to execute. * @param args Arguments of this task. * @param numReturns Return object count. 
- * @param options Options for this task. - * @return Ids of the return objects. + * @param options Options for this task. Returns Ids of the return objects. */ List submitTask( FunctionDescriptor functionDescriptor, @@ -35,8 +34,7 @@ List submitTask( * * @param functionDescriptor The remote function that generates the actor object. * @param args Arguments of this task. - * @param options Options for this actor creation task. - * @return Handle to the actor. + * @param options Options for this actor creation task. Returns Handle to the actor. * @throws IllegalArgumentException if actor of specified name exists */ BaseActorHandle createActor( @@ -50,8 +48,7 @@ BaseActorHandle createActor( * @param functionDescriptor The remote function to execute. * @param args Arguments of this task. * @param numReturns Return object count. - * @param options Options for this task. - * @return Ids of the return objects. + * @param options Options for this task. Returns Ids of the return objects. */ List submitActorTask( BaseActorHandle actor, @@ -65,8 +62,7 @@ List submitActorTask( * * @param name Name of the placement group. * @param bundles Pre-allocated resource list. - * @param strategy Actor placement strategy. - * @return A handle to the created placement group. + * @param strategy Actor placement strategy. Returns A handle to the created placement group. */ PlacementGroup createPlacementGroup( String name, List> bundles, PlacementStrategy strategy); @@ -82,8 +78,8 @@ PlacementGroup createPlacementGroup( * Wait for the placement group to be ready within the specified time. * * @param id Id of placement group. - * @param timeoutMs Timeout in milliseconds. - * @return True if the placement group is created. False otherwise. + * @param timeoutMs Timeout in milliseconds. Returns True if the placement group is created. False + * otherwise. 
*/ boolean waitPlacementGroupReady(PlacementGroupId id, int timeoutMs); diff --git a/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java index f3282ed08c56..85c327a446b7 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java @@ -21,8 +21,7 @@ public class BinaryFileUtil { * will be protected by a file lock. * * @param destDir a directory to extract resource file to - * @param fileName resource file name - * @return extracted resource file + * @param fileName resource file name Returns extracted resource file */ public static File getNativeFile(String destDir, String fileName) { final File dir = new File(destDir); diff --git a/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java index 239568afa51b..4f7bf2580af2 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java @@ -13,7 +13,7 @@ public class IdUtil { /** * Compute the actor ID of the task which created this object. * - * @return The actor ID of the task which created this object. + *

Returns The actor ID of the task which created this object. */ public static ActorId getActorIdFromObjectId(ObjectId objectId) { byte[] taskIdBytes = new byte[TaskId.LENGTH]; diff --git a/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java index e9676d07b2f6..0c7a93d27818 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java @@ -11,8 +11,8 @@ public class ResourceUtil { * Convert resources map to a string that is used for the command line argument of starting * raylet. * - * @param resources The resources map to be converted. - * @return The starting-raylet command line argument, like "CPU,4,GPU,0". + * @param resources The resources map to be converted. Returns The starting-raylet command line + * argument, like "CPU,4,GPU,0". */ public static String getResourcesStringFromMap(Map resources) { StringBuilder builder = new StringBuilder(); @@ -32,9 +32,8 @@ public static String getResourcesStringFromMap(Map resources) { /** * Parse the static resources configure field and convert to the resources map. * - * @param resources The static resources string to be parsed. - * @return The map whose key represents the resource name and the value represents the resource - * quantity. + * @param resources The static resources string to be parsed. Returns The map whose key represents + * the resource name and the value represents the resource quantity. * @throws IllegalArgumentException If the resources string's format does match, it will throw an * IllegalArgumentException. */ diff --git a/java/test.sh b/java/test.sh index b49f06037c10..8336c1da1c5f 100755 --- a/java/test.sh +++ b/java/test.sh @@ -17,26 +17,19 @@ pushd "$ROOT_DIR" popd run_testng() { - local pid local exit_code - "$@" & - pid=$! - if wait $pid; then + if "$@"; then exit_code=0 else exit_code=$? 
fi # exit_code == 2 means there are skipped tests. if [ $exit_code -ne 2 ] && [ $exit_code -ne 0 ] ; then - # Only print log files if it ran in cluster mode - if [[ ! "$*" =~ SINGLE_PROCESS ]]; then - if [ $exit_code -gt 128 ] ; then - # Test crashed. Print the driver log for diagnosis. - cat /tmp/ray/session_latest/logs/java-core-driver-*$pid* - fi + if [ $exit_code -gt 128 ] ; then + # Test crashed. Print the driver log for diagnosis. + cat /tmp/ray/session_latest/logs/java-core-driver-* fi - # Only print the hs_err_pid file of TestNG process - find . -name "hs_err_pid$pid.log" -exec cat {} + + find . -name "hs_err_*log" -exec cat {} + exit $exit_code fi } @@ -48,44 +41,18 @@ bazel build //java:gen_maven_deps echo "Build test jar." bazel build //java:all_tests_deploy.jar -java/generate_jni_header_files.sh +# Enable multi-worker feature in Java test +TEST_ARGS=(-Dray.job.num-java-workers-per-process=10) -if ! git diff --exit-code -- java src/ray/core_worker/lib/java; then - echo "Files are changed after build. Common cases are:" - echo " * Java native methods doesn't match JNI files. You need to either update Java code or JNI code." - echo " * pom_template.xml and pom.xml doesn't match. You need to either update pom_template.xml or pom.xml." - exit 1 -fi - -# NOTE(kfstrom): Java test troubleshooting only. -# Set MAX_ROUNDS to a big number (e.g. 1000) to run Java tests repeatedly. -# You may also want to modify java/testng.xml to run only a subset of test cases. -MAX_ROUNDS=1 -if [ $MAX_ROUNDS -gt 1 ]; then - export RAY_BACKEND_LOG_LEVEL=debug -fi - -round=1 -while true; do - echo Starting cluster mode test round $round - - echo "Running tests under cluster mode." - # TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, - # TestNG will exit with code 2. And bazel treats it as test failure. - # bazel test //java:all_tests --config=ci || cluster_exit_code=$? 
- run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml - - echo Finished cluster mode test round $round - date - round=$((round+1)) - if (( round > MAX_ROUNDS )); then - break - fi -done +echo "Running tests under cluster mode." +# TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, +# TestNG will exit with code 2. And bazel treats it as test failure. +# bazel test //java:all_tests --config=ci || cluster_exit_code=$? +run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar "${TEST_ARGS[@]}" org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running tests under single-process mode." # bazel test //java:all_tests --jvmopt="-Dray.run-mode=SINGLE_PROCESS" --config=ci || single_exit_code=$? -run_testng java -Dray.run-mode="SINGLE_PROCESS" -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml +run_testng java -Dray.run-mode="SINGLE_PROCESS" -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar "${TEST_ARGS[@]}" org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running connecting existing cluster tests." case "${OSTYPE}" in @@ -98,18 +65,15 @@ RAY_BACKEND_LOG_LEVEL=debug java -cp bazel-bin/java/all_tests_deploy.jar -Dray.a -Dray.redis.password='123456' -Dray.job.code-search-path="$PWD/bazel-bin/java/all_tests_deploy.jar" io.ray.test.MultiDriverTest ray stop -# See issue #13742 the test is very flaky. -# Skipping the doc test for now. - -# echo "Running documentation demo code." -# docdemo_path="java/test/src/main/java/io/ray/docdemo/" -# for file in "$docdemo_path"*.java; do -# file=${file#"$docdemo_path"} -# class=${file%".java"} -# echo "Running $class" -# java -cp bazel-bin/java/all_tests_deploy.jar "io.ray.docdemo.$class" -# done -# popd +echo "Running documentation demo code." 
+docdemo_path="java/test/src/main/java/io/ray/docdemo/" +for file in "$docdemo_path"*.java; do + file=${file#"$docdemo_path"} + class=${file%".java"} + echo "Running $class" + java -cp bazel-bin/java/all_tests_deploy.jar "io.ray.docdemo.$class" +done +popd pushd "$ROOT_DIR" echo "Testing maven install." diff --git a/java/test/pom.xml b/java/test/pom.xml index f401f3cff5ab..c9e34821b544 100644 --- a/java/test/pom.xml +++ b/java/test/pom.xml @@ -117,6 +117,41 @@ + + + com.diffplug.spotless + spotless-maven-plugin + 2.6.1 + + + + + + + + + .java + + + + + + + + true + 4 + + + + + + + 1.7 + + + + + diff --git a/java/test/src/main/java/io/ray/test/ActorRestartTest.java b/java/test/src/main/java/io/ray/test/ActorRestartTest.java index c57f9b6142d1..fe70e086764d 100644 --- a/java/test/src/main/java/io/ray/test/ActorRestartTest.java +++ b/java/test/src/main/java/io/ray/test/ActorRestartTest.java @@ -3,7 +3,6 @@ import io.ray.api.ActorHandle; import io.ray.api.Ray; import io.ray.runtime.exception.RayActorException; -import io.ray.runtime.exception.RayException; import io.ray.runtime.util.SystemUtil; import java.io.IOException; import java.util.concurrent.TimeUnit; @@ -57,7 +56,6 @@ public void testActorRestart() throws InterruptedException, IOException { // Kill the actor process. killActorProcess(actor); - waitForActorAlive(actor); int value = actor.task(Counter::increase).remote().get(); Assert.assertEquals(value, 1); @@ -83,18 +81,4 @@ private static void killActorProcess(ActorHandle actor) // Wait for the actor to be killed. 
TimeUnit.SECONDS.sleep(1); } - - private static void waitForActorAlive(ActorHandle actor) { - Assert.assertTrue( - TestUtils.waitForCondition( - () -> { - try { - actor.task(Counter::getPid).remote().get(); - return true; - } catch (RayException e) { - return false; - } - }, - 10000)); - } } diff --git a/java/test/src/main/java/io/ray/test/ExitActorTest.java b/java/test/src/main/java/io/ray/test/ExitActorTest.java index a1c40e2ac8a1..279af55c05e5 100644 --- a/java/test/src/main/java/io/ray/test/ExitActorTest.java +++ b/java/test/src/main/java/io/ray/test/ExitActorTest.java @@ -15,9 +15,7 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test( - groups = {"cluster"}, - enabled = false) +@Test(groups = {"cluster"}) public class ExitActorTest extends BaseTest { private static class ExitingActor { diff --git a/java/test/src/main/java/io/ray/test/FailureTest.java b/java/test/src/main/java/io/ray/test/FailureTest.java index 5bfc40dd672e..218c78271023 100644 --- a/java/test/src/main/java/io/ray/test/FailureTest.java +++ b/java/test/src/main/java/io/ray/test/FailureTest.java @@ -23,17 +23,20 @@ public class FailureTest extends BaseTest { private static final String EXCEPTION_MESSAGE = "Oops"; + private String oldNumWorkersPerProcess; + @BeforeClass public void setUp() { // This is needed by `testGetThrowsQuicklyWhenFoundException`. // Set one worker per process. Otherwise, if `badFunc2` and `slowFunc` run in the same // process, `sleep` will delay `System.exit`. 
+ oldNumWorkersPerProcess = System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "1"); } @AfterClass public void tearDown() { - System.clearProperty("ray.job.num-java-workers-per-process"); + System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); } public static int badFunc() { diff --git a/java/test/src/main/java/io/ray/test/JobConfigTest.java b/java/test/src/main/java/io/ray/test/JobConfigTest.java index f5efc3377c3c..4ba9e484d5a1 100644 --- a/java/test/src/main/java/io/ray/test/JobConfigTest.java +++ b/java/test/src/main/java/io/ray/test/JobConfigTest.java @@ -10,8 +10,11 @@ @Test(groups = {"cluster"}) public class JobConfigTest extends BaseTest { + private String oldNumWorkersPerProcess; + @BeforeClass public void setupJobConfig() { + oldNumWorkersPerProcess = System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "3"); System.setProperty("ray.job.jvm-options.0", "-DX=999"); System.setProperty("ray.job.jvm-options.1", "-DY=998"); @@ -21,7 +24,7 @@ public void setupJobConfig() { @AfterClass public void tearDownJobConfig() { - System.clearProperty("ray.job.num-java-workers-per-process"); + System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); System.clearProperty("ray.job.jvm-options.0"); System.clearProperty("ray.job.jvm-options.1"); System.clearProperty("ray.job.worker-env.foo1"); diff --git a/java/test/src/main/java/io/ray/test/KillActorTest.java b/java/test/src/main/java/io/ray/test/KillActorTest.java index 753b00a9c59c..d862d3e1232a 100644 --- a/java/test/src/main/java/io/ray/test/KillActorTest.java +++ b/java/test/src/main/java/io/ray/test/KillActorTest.java @@ -14,14 +14,17 @@ @Test(groups = {"cluster"}) public class KillActorTest extends BaseTest { + private String oldNumWorkersPerProcess; + @BeforeClass public void setUp() { + oldNumWorkersPerProcess = 
System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "1"); } @AfterClass public void tearDown() { - System.clearProperty("ray.job.num-java-workers-per-process"); + System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); } public static class HangActor { @@ -59,8 +62,6 @@ private static void remoteKill(ActorHandle actor, boolean noRestart) { private void testKillActor(BiConsumer, Boolean> kill, boolean noRestart) { ActorHandle actor = Ray.actor(HangActor::new).setMaxRestarts(1).remote(); - // Wait for the actor to be created. - actor.task(HangActor::ping).remote().get(); ObjectRef result = actor.task(HangActor::hang).remote(); // The actor will hang in this task. Assert.assertEquals(0, Ray.wait(ImmutableList.of(result), 1, 500).getReady().size()); diff --git a/java/test/src/main/java/io/ray/test/MultiDriverTest.java b/java/test/src/main/java/io/ray/test/MultiDriverTest.java index 3feb981927c0..9c781f56283f 100644 --- a/java/test/src/main/java/io/ray/test/MultiDriverTest.java +++ b/java/test/src/main/java/io/ray/test/MultiDriverTest.java @@ -17,9 +17,7 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test( - groups = {"cluster"}, - enabled = false) +@Test(groups = {"cluster"}) public class MultiDriverTest extends BaseTest { private static final int DRIVER_COUNT = 10; diff --git a/java/test/src/main/java/io/ray/test/NodeIpTest.java b/java/test/src/main/java/io/ray/test/NodeIpTest.java deleted file mode 100644 index 4aee086efcb7..000000000000 --- a/java/test/src/main/java/io/ray/test/NodeIpTest.java +++ /dev/null @@ -1,46 +0,0 @@ -package io.ray.test; - -import io.ray.api.Ray; -import org.apache.commons.lang3.SystemUtils; -import org.testng.Assert; -import org.testng.SkipException; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -@Test(groups = {"cluster"}) -public class NodeIpTest 
extends BaseTest { - - private static final String NODE_IP = "127.0.0.2"; - - @BeforeClass - public void setUp() { - if (SystemUtils.IS_OS_MAC) { - throw new SkipException("Skip NodeIpTest on Mac OS"); - } - System.setProperty("ray.head-args.0", "--node-ip-address=127.0.0.2"); - System.setProperty("ray.node-ip", "127.0.0.2"); - } - - @AfterClass - public void tearDown() { - if (!SystemUtils.IS_OS_MAC) { - System.clearProperty("ray.head-args.0"); - System.clearProperty("ray.node-ip"); - } - } - - static String getNodeIp() { - return TestUtils.getRuntime().getRayConfig().nodeIp; - } - - public void testNodeIp() { - // this is on the driver node, and it should be equal with ray.node-ip - String nodeIP = TestUtils.getRuntime().getRayConfig().nodeIp; - Assert.assertEquals(nodeIP, NODE_IP); - - // this is on the worker node, and it should be equal with node-ip-address - nodeIP = Ray.task(NodeIpTest::getNodeIp).remote().get(); - Assert.assertEquals(nodeIP, NODE_IP); - } -} diff --git a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java index 89d1fab69452..14bf0fd6a577 100644 --- a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java +++ b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java @@ -7,6 +7,7 @@ import io.ray.api.placementgroup.PlacementGroupState; import io.ray.api.placementgroup.PlacementStrategy; import io.ray.runtime.exception.RayException; +import io.ray.runtime.placementgroup.PlacementGroupImpl; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; @@ -31,7 +32,8 @@ public int getValue() { // This test just creates a placement group with one bundle. // It's not comprehensive to test all placement group test cases. 
public void testCreateAndCallActor() { - PlacementGroup placementGroup = PlacementGroupTestUtils.createSimpleGroup(); + PlacementGroupImpl placementGroup = + (PlacementGroupImpl) PlacementGroupTestUtils.createSimpleGroup(); Assert.assertTrue(placementGroup.wait(10)); Assert.assertEquals(placementGroup.getName(), "unnamed_group"); @@ -46,18 +48,22 @@ public void testCreateAndCallActor() { @Test(groups = {"cluster"}) public void testGetPlacementGroup() { - PlacementGroup firstPlacementGroup = - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); - - PlacementGroup secondPlacementGroup = - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); + PlacementGroupImpl firstPlacementGroup = + (PlacementGroupImpl) + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); + + PlacementGroupImpl secondPlacementGroup = + (PlacementGroupImpl) + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); Assert.assertTrue(firstPlacementGroup.wait(10)); Assert.assertTrue(secondPlacementGroup.wait(10)); - PlacementGroup firstPlacementGroupRes = Ray.getPlacementGroup((firstPlacementGroup).getId()); - PlacementGroup secondPlacementGroupRes = Ray.getPlacementGroup((secondPlacementGroup).getId()); + PlacementGroupImpl firstPlacementGroupRes = + (PlacementGroupImpl) Ray.getPlacementGroup((firstPlacementGroup).getId()); + PlacementGroupImpl secondPlacementGroupRes = + (PlacementGroupImpl) Ray.getPlacementGroup((secondPlacementGroup).getId()); Assert.assertNotNull(firstPlacementGroupRes); Assert.assertNotNull(secondPlacementGroupRes); @@ -70,9 +76,9 @@ public void testGetPlacementGroup() { List allPlacementGroup = Ray.getAllPlacementGroups(); Assert.assertEquals(allPlacementGroup.size(), 2); - PlacementGroup 
placementGroupRes = allPlacementGroup.get(0); + PlacementGroupImpl placementGroupRes = (PlacementGroupImpl) allPlacementGroup.get(0); Assert.assertNotNull(placementGroupRes.getId()); - PlacementGroup expectPlacementGroup = + PlacementGroupImpl expectPlacementGroup = placementGroupRes.getId().equals(firstPlacementGroup.getId()) ? firstPlacementGroup : secondPlacementGroup; @@ -83,23 +89,23 @@ public void testGetPlacementGroup() { Assert.assertEquals(placementGroupRes.getStrategy(), expectPlacementGroup.getStrategy()); } - @Test( - groups = {"cluster"}, - enabled = false) + @Test(groups = {"cluster"}) public void testRemovePlacementGroup() { PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); - PlacementGroup secondPlacementGroup = - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); + PlacementGroupImpl secondPlacementGroup = + (PlacementGroupImpl) + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); List allPlacementGroup = Ray.getAllPlacementGroups(); Assert.assertEquals(allPlacementGroup.size(), 2); Ray.removePlacementGroup(secondPlacementGroup.getId()); - PlacementGroup removedPlacementGroup = Ray.getPlacementGroup((secondPlacementGroup).getId()); + PlacementGroupImpl removedPlacementGroup = + (PlacementGroupImpl) Ray.getPlacementGroup((secondPlacementGroup).getId()); Assert.assertEquals(removedPlacementGroup.getState(), PlacementGroupState.REMOVED); // Wait for placement group after it is removed. 
diff --git a/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java b/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java index b8235b8d84fa..3e49ff798630 100644 --- a/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java +++ b/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java @@ -25,7 +25,7 @@ public void testDeleteObjects() { () -> !TestUtils.getRuntime() .getObjectStore() - .wait(ImmutableList.of(((ObjectRefImpl) helloId).getId()), 1, 0, true) + .wait(ImmutableList.of(((ObjectRefImpl) helloId).getId()), 1, 0) .get(0), 50); if (TestUtils.isSingleProcessMode()) { diff --git a/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java b/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java index a98f9595914b..aa56581951e6 100644 --- a/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java +++ b/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java @@ -119,7 +119,7 @@ private static void fillObjectStoreAndGet( TestUtils.getRuntime().getObjectStore().getRaw(ImmutableList.of(objectId), Long.MAX_VALUE); } else { List result = - TestUtils.getRuntime().getObjectStore().wait(ImmutableList.of(objectId), 1, 100, true); + TestUtils.getRuntime().getObjectStore().wait(ImmutableList.of(objectId), 1, 100); Assert.assertFalse(result.get(0)); } } diff --git a/java/test/src/main/java/io/ray/test/TestProgressListener.java b/java/test/src/main/java/io/ray/test/TestProgressListener.java index 915d82af317b..1fed5ac21375 100644 --- a/java/test/src/main/java/io/ray/test/TestProgressListener.java +++ b/java/test/src/main/java/io/ray/test/TestProgressListener.java @@ -1,42 +1,27 @@ package io.ray.test; -import com.google.common.collect.ImmutableList; -import io.ray.runtime.runner.RunManager; -import java.io.File; import java.time.LocalDateTime; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import 
org.apache.commons.lang3.SystemUtils; import org.testng.IInvokedMethod; import org.testng.IInvokedMethodListener; import org.testng.ITestContext; import org.testng.ITestListener; import org.testng.ITestResult; -import org.testng.SkipException; public class TestProgressListener implements IInvokedMethodListener, ITestListener { - // Travis aborts CI if no outputs for 10 minutes. So threshold needs to be smaller than 10m. - private static final long hangDetectionThresholdMillis = 5 * 60 * 1000; - private static final int TAIL_NO_OF_LINES = 500; - private Thread testMainThread; - private long testStartTimeMillis; - private String getFullTestName(ITestResult testResult) { return testResult.getTestClass().getName() + "." + testResult.getMethod().getMethodName(); } - private void printSection(String sectionName) { + private void printInfo(String tag, String content) { System.out.println( - "============ [" + LocalDateTime.now().toString() + "] " + sectionName + " ============"); - } - - private void printTestStage(String tag, String content) { - printSection("[" + tag + "] " + content); + "============ [" + + LocalDateTime.now().toString() + + "] [" + + tag + + "] " + + content + + " ============"); } @Override @@ -47,50 +32,31 @@ public void afterInvocation(IInvokedMethod method, ITestResult testResult) {} @Override public void onTestStart(ITestResult result) { - printTestStage("TEST START", getFullTestName(result)); - testStartTimeMillis = System.currentTimeMillis(); - // TODO(kfstorm): Add a timer to detect hang - if (testMainThread == null) { - testMainThread = Thread.currentThread(); - Thread hangDetectionThread = - new Thread( - () -> { - try { - // If current task case has ran for more than 5 minutes. 
- while (System.currentTimeMillis() - testStartTimeMillis - < hangDetectionThresholdMillis) { - Thread.sleep(1000); - } - printDebugInfo(null, /*testHanged=*/ true); - } catch (InterruptedException e) { - // ignored - } - }); - hangDetectionThread.setDaemon(true); - hangDetectionThread.start(); - } + printInfo("TEST START", getFullTestName(result)); } @Override public void onTestSuccess(ITestResult result) { - printTestStage("TEST SUCCESS", getFullTestName(result)); + printInfo("TEST SUCCESS", getFullTestName(result)); } @Override public void onTestFailure(ITestResult result) { - printTestStage("TEST FAILURE", getFullTestName(result)); - printDebugInfo(result, /*testHanged=*/ false); + printInfo("TEST FAILURE", getFullTestName(result)); + Throwable throwable = result.getThrowable(); + if (throwable != null) { + throwable.printStackTrace(); + } } @Override public void onTestSkipped(ITestResult result) { - printTestStage("TEST SKIPPED", getFullTestName(result)); - printDebugInfo(result, /*testHanged=*/ false); + printInfo("TEST SKIPPED", getFullTestName(result)); } @Override public void onTestFailedButWithinSuccessPercentage(ITestResult result) { - printTestStage("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); + printInfo("TEST FAILED BUT WITHIN SUCCESS PERCENTAGE", getFullTestName(result)); } @Override @@ -98,102 +64,4 @@ public void onStart(ITestContext context) {} @Override public void onFinish(ITestContext context) {} - - private void printDebugInfo(ITestResult result, boolean testHanged) { - boolean testFailed = false; - if (result != null) { - Throwable throwable = result.getThrowable(); - if (throwable != null && !(throwable instanceof SkipException)) { - testFailed = true; - throwable.printStackTrace(); - } - } - if (!testFailed && !testHanged) { - return; - } - - if (testHanged) { - printSection("TEST CASE HANGED"); - printSection("STACK TRACE OF TEST THREAD"); - for (StackTraceElement element : testMainThread.getStackTrace()) { - 
System.out.println(element.toString()); - } - Set javaPids = getJavaPids(); - for (Integer pid : javaPids) { - runCommandSafely(ImmutableList.of("jstack", pid.toString())); - // TODO(kfstorm): Check lldb or gdb exists rather than detecting OS type. - if (SystemUtils.IS_OS_MAC) { - runCommandSafely( - ImmutableList.of("lldb", "--batch", "-o", "bt all", "-p", pid.toString())); - } else { - runCommandSafely( - ImmutableList.of( - "sudo", "gdb", "-batch", "-ex", "thread apply all bt", "-p", pid.toString())); - } - } - } - - printLogFiles(); - - if (testHanged) { - printSection("ABORT TEST"); - System.exit(1); - } - } - - private String runCommandSafely(List command) { - String output; - String commandString = String.join(" ", command); - printSection(commandString); - try { - output = RunManager.runCommand(command); - System.out.println(output); - } catch (Exception e) { - System.out.println("Failed to execute command: " + commandString); - e.printStackTrace(); - output = ""; - } - return output; - } - - private Set getJavaPids() { - Set javaPids = new HashSet<>(); - String jpsOutput = runCommandSafely(ImmutableList.of("jps", "-v")); - try { - for (String line : StringUtils.split(jpsOutput, "\n")) { - String[] parts = StringUtils.split(line); - if (parts.length > 1 && parts[1].toLowerCase().equals("jps")) { - // Skip jps. 
- continue; - } - Integer pid = Integer.valueOf(parts[0]); - javaPids.add(pid); - } - } catch (Exception e) { - System.out.println("Failed to parse jps output."); - e.printStackTrace(); - } - - String pgrepJavaResult = runCommandSafely(ImmutableList.of("pgrep", "java")); - try { - for (String line : StringUtils.split(pgrepJavaResult, "\n")) { - Integer pid = Integer.valueOf(line); - javaPids.add(pid); - } - } catch (Exception e) { - System.out.println("Failed to parse pgrep java output."); - e.printStackTrace(); - } - - return javaPids; - } - - private void printLogFiles() { - Collection logFiles = - FileUtils.listFiles(new File("/tmp/ray/session_latest/logs"), null, false); - for (File file : logFiles) { - runCommandSafely( - ImmutableList.of("tail", "-n", String.valueOf(TAIL_NO_OF_LINES), file.getAbsolutePath())); - } - } } diff --git a/java/test/src/main/resources/ray.conf b/java/test/src/main/resources/ray.conf deleted file mode 100644 index b838c0075a3f..000000000000 --- a/java/test/src/main/resources/ray.conf +++ /dev/null @@ -1,6 +0,0 @@ -ray { - job { - # Enable multi-worker feature in Java test - num-java-workers-per-process: 10 - } -} diff --git a/java/testng.xml b/java/testng.xml index 0db2704845d4..6cc10b9ab24a 100644 --- a/java/testng.xml +++ b/java/testng.xml @@ -1,6 +1,6 @@ - + diff --git a/python/ray/_private/client_mode_hook.py b/python/ray/_private/client_mode_hook.py index 74682f1cfa9d..3ceef7316abd 100644 --- a/python/ray/_private/client_mode_hook.py +++ b/python/ray/_private/client_mode_hook.py @@ -2,9 +2,6 @@ from contextlib import contextmanager from functools import wraps -# Attr set on func defs to mark they have been converted to client mode. 
-RAY_CLIENT_MODE_ATTR = "__ray_client_mode_key__" - client_mode_enabled = os.environ.get("RAY_CLIENT_MODE", "0") == "1" _client_hook_enabled = True @@ -37,54 +34,16 @@ def disable_client_hook(): def client_mode_hook(func): - """Decorator for ray module methods to delegate to ray client""" + """ + Decorator for ray module methods to delegate to ray client + """ from ray.util.client import ray @wraps(func) def wrapper(*args, **kwargs): - if client_mode_should_convert(): + global _client_hook_enabled + if client_mode_enabled and _client_hook_enabled: return getattr(ray, func.__name__)(*args, **kwargs) return func(*args, **kwargs) return wrapper - - -def client_mode_should_convert(): - global _client_hook_enabled - return client_mode_enabled and _client_hook_enabled - - -def client_mode_convert_function(func_cls, in_args, in_kwargs, **kwargs): - """Runs a preregistered ray RemoteFunction through the ray client. - - The common case for this is to transparently convert that RemoteFunction - to a ClientRemoteFunction. This happens in circumstances where the - RemoteFunction is declared early, in a library and only then is Ray used in - client mode -- nescessitating a conversion. - """ - from ray.util.client import ray - - key = getattr(func_cls, RAY_CLIENT_MODE_ATTR, None) - if key is None: - key = ray._convert_function(func_cls) - setattr(func_cls, RAY_CLIENT_MODE_ATTR, key) - client_func = ray._get_converted(key) - return client_func._remote(in_args, in_kwargs, **kwargs) - - -def client_mode_convert_actor(actor_cls, in_args, in_kwargs, **kwargs): - """Runs a preregistered actor class on the ray client - - The common case for this decorator is for instantiating an ActorClass - transparently as a ClientActorClass. This happens in circumstances where - the ActorClass is declared early, in a library and only then is Ray used in - client mode -- nescessitating a conversion. 
- """ - from ray.util.client import ray - - key = getattr(actor_cls, RAY_CLIENT_MODE_ATTR, None) - if key is None: - key = ray._convert_actor(actor_cls) - setattr(actor_cls, RAY_CLIENT_MODE_ATTR, key) - client_actor = ray._get_converted(key) - return client_actor._remote(in_args, in_kwargs, **kwargs) diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index c3144c05f39c..c9ea996f9c0c 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -216,7 +216,7 @@ def get_ray_address_to_use_or_die(): A string to pass into `ray.init(address=...)` """ if "RAY_ADDRESS" in os.environ: - return os.environ.get("RAY_ADDRESS") + return "auto" # Avoid conflict with RAY_ADDRESS env var return find_redis_address_or_die() @@ -829,13 +829,6 @@ def start_redis(node_ip_address, redis_modules = [REDIS_MODULE] redis_stdout_file, redis_stderr_file = redirect_files[0] - # If no port is given, fallback to default Redis port for the primary - # shard. - if port is None: - port = ray_constants.DEFAULT_PORT - num_retries = 20 - else: - num_retries = 1 # Start the primary Redis shard. port, p = _start_redis_instance( redis_executable, @@ -843,7 +836,6 @@ def start_redis(node_ip_address, port=port, password=password, redis_max_clients=redis_max_clients, - num_retries=num_retries, # Below we use None to indicate no limit on the memory of the # primary Redis shard. redis_max_memory=None, @@ -877,29 +869,17 @@ def start_redis(node_ip_address, # Start other Redis shards. Each Redis shard logs to a separate file, # prefixed by "redis-". redis_shards = [] - # Attempt to start the other Redis shards port range right after the - # primary Redis shard port. 
- last_shard_port = port for i in range(num_redis_shards): redis_stdout_file, redis_stderr_file = redirect_files[i + 1] redis_executable = REDIS_EXECUTABLE redis_modules = [REDIS_MODULE] - redis_shard_port = redis_shard_ports[i] - # If no shard port is given, try to start this shard's Redis instance - # on the port right after the last shard's port. - if redis_shard_port is None: - redis_shard_port = last_shard_port + 1 - num_retries = 20 - else: - num_retries = 1 redis_shard_port, p = _start_redis_instance( redis_executable, modules=redis_modules, - port=redis_shard_port, + port=redis_shard_ports[i], password=password, redis_max_clients=redis_max_clients, - num_retries=num_retries, redis_max_memory=redis_max_memory, stdout_file=redis_stdout_file, stderr_file=redis_stderr_file, @@ -910,14 +890,13 @@ def start_redis(node_ip_address, redis_shards.append(shard_address) # Store redis shard information in the primary redis shard. primary_redis_client.rpush("RedisShards", shard_address) - last_shard_port = redis_shard_port return redis_address, redis_shards, processes def _start_redis_instance(executable, modules, - port, + port=None, redis_max_clients=None, num_retries=20, stdout_file=None, @@ -928,19 +907,20 @@ def _start_redis_instance(executable, """Start a single Redis server. Notes: - We will initially try to start the Redis instance at the given port, - and then try at most `num_retries - 1` times to start the Redis - instance at successive random ports. + If "port" is not None, then we will only use this port and try + only once. Otherwise, we will first try the default redis port, + and if it is unavailable, we will try random ports with + maximum retries of "num_retries". Args: executable (str): Full path of the redis-server executable. modules (list of str): A list of pathnames, pointing to the redis module(s) that will be loaded in this redis server. - port (int): Try to start a Redis server at this port. 
+ port (int): If provided, start a Redis server with this port. redis_max_clients: If this is provided, Ray will attempt to configure Redis with this maxclients number. - num_retries (int): The number of times to attempt to start Redis at - successive ports. + num_retries (int): The number of times to attempt to start Redis. If a + port is provided, this defaults to 1. stdout_file: A file handle opened for writing to redirect stdout to. If no redirection should happen, then this should be None. stderr_file: A file handle opened for writing to redirect stderr to. If @@ -963,6 +943,13 @@ def _start_redis_instance(executable, for module in modules: assert os.path.isfile(module) counter = 0 + if port is not None: + # If a port is specified, then try only once to connect. + # This ensures that we will use the given port. + num_retries = 1 + else: + port = ray_constants.DEFAULT_PORT + load_module_args = [] for module in modules: load_module_args += ["--loadmodule", module] @@ -1058,9 +1045,7 @@ def start_log_monitor(redis_address, stdout_file=None, stderr_file=None, redis_password=None, - fate_share=None, - max_bytes=0, - backup_count=0): + fate_share=None): """Start a log monitor process. Args: @@ -1071,20 +1056,17 @@ def start_log_monitor(redis_address, stderr_file: A file handle opened for writing to redirect stderr to. If no redirection should happen, then this should be None. redis_password (str): The password of the redis server. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" log_monitor_filepath = os.path.join(RAY_PATH, "log_monitor.py") command = [ - sys.executable, "-u", log_monitor_filepath, - f"--redis-address={redis_address}", f"--logs-dir={logs_dir}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}" + sys.executable, + "-u", + log_monitor_filepath, + f"--redis-address={redis_address}", + f"--logs-dir={logs_dir}", ] if redis_password: command += ["--redis-password", redis_password] @@ -1106,9 +1088,7 @@ def start_dashboard(require_dashboard, stdout_file=None, stderr_file=None, redis_password=None, - fate_share=None, - max_bytes=0, - backup_count=0): + fate_share=None): """Start a dashboard process. Args: @@ -1127,10 +1107,6 @@ def start_dashboard(require_dashboard, stderr_file: A file handle opened for writing to redirect stderr to. If no redirection should happen, then this should be None. redis_password (str): The password of the redis server. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
@@ -1156,11 +1132,14 @@ def start_dashboard(require_dashboard, dashboard_dir = "new_dashboard" dashboard_filepath = os.path.join(RAY_PATH, dashboard_dir, "dashboard.py") command = [ - sys.executable, "-u", dashboard_filepath, f"--host={host}", - f"--port={port}", f"--redis-address={redis_address}", - f"--temp-dir={temp_dir}", f"--log-dir={logdir}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}" + sys.executable, + "-u", + dashboard_filepath, + f"--host={host}", + f"--port={port}", + f"--redis-address={redis_address}", + f"--temp-dir={temp_dir}", + f"--log-dir={logdir}", ] if redis_password: @@ -1279,9 +1258,7 @@ def start_raylet(redis_address, fate_share=None, socket_to_use=None, head_node=False, - start_initial_python_workers_for_first_job=False, - max_bytes=0, - backup_count=0): + start_initial_python_workers_for_first_job=False): """Start a raylet, which is a combined local scheduler and object manager. Args: @@ -1318,10 +1295,6 @@ def start_raylet(redis_address, config (dict|None): Optional Raylet configuration that will override defaults in RayConfig. java_worker_options (list): The command options for Java worker. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" @@ -1370,7 +1343,6 @@ def start_raylet(redis_address, raylet_name, redis_password, session_dir, - node_ip_address, ) else: java_worker_command = [] @@ -1400,8 +1372,6 @@ def start_raylet(redis_address, f"--config-list={config_str}", f"--temp-dir={temp_dir}", f"--metrics-agent-port={metrics_agent_port}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}", "RAY_WORKER_DYNAMIC_OPTION_PLACEHOLDER", ] if redis_password: @@ -1432,8 +1402,6 @@ def start_raylet(redis_address, f"--raylet-name={raylet_name}", f"--temp-dir={temp_dir}", f"--log-dir={log_dir}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}", ] if redis_password is not None and len(redis_password) != 0: @@ -1509,8 +1477,7 @@ def get_ray_jars_dir(): def build_java_worker_command(java_worker_options, redis_address, node_manager_port, plasma_store_name, - raylet_name, redis_password, session_dir, - node_ip_address): + raylet_name, redis_password, session_dir): """This method assembles the command used to start a Java worker. Args: @@ -1521,7 +1488,6 @@ def build_java_worker_command(java_worker_options, redis_address, raylet_name (str): The name of the raylet socket to create. redis_password (str): The password of connect to redis. session_dir (str): The path of this session. - node_ip_address (str): The ip address for this node. Returns: The command string for starting Java worker. """ @@ -1539,9 +1505,6 @@ def build_java_worker_command(java_worker_options, redis_address, if redis_password is not None: pairs.append(("ray.redis.password", redis_password)) - if node_ip_address is not None: - pairs.append(("ray.node-ip", node_ip_address)) - pairs.append(("ray.home", RAY_HOME)) pairs.append(("ray.logging.dir", os.path.join(session_dir, "logs"))) pairs.append(("ray.session-dir", session_dir)) @@ -1599,9 +1562,13 @@ def build_cpp_worker_command( The command string for starting CPP worker. 
""" + # TODO(Guyang Song): Remove the arg is_default_worker. + # See `cluster_mode_test.cc` for why this workaround is currently needed + # for C++ workers. command = [ DEFAULT_WORKER_EXECUTABLE, plasma_store_name, raylet_name, - str(node_manager_port), redis_address, redis_password, session_dir + str(node_manager_port), redis_address, redis_password, session_dir, + "is_default_worker" ] return command @@ -1655,11 +1622,10 @@ def determine_plasma_store_config(object_store_memory, "This will harm performance! You may be able to free up " "space by deleting files in /dev/shm. If you are inside a " "Docker container, you can increase /dev/shm size by " - "passing '--shm-size={:.2f}gb' to 'docker run' (or add it " - "to the run_options list in a Ray cluster config). Make " - "sure to set this to more than 30% of available RAM.". - format(ray.utils.get_user_temp_dir(), shm_avail, - object_store_memory * (1.1) / (2**30))) + "passing '--shm-size=Xgb' to 'docker run' (or add it to " + "the run_options list in a Ray cluster config). Make sure " + "to set this to more than 2gb.".format( + ray.utils.get_user_temp_dir(), shm_avail)) else: plasma_directory = ray.utils.get_user_temp_dir() @@ -1813,9 +1779,7 @@ def start_monitor(redis_address, stderr_file=None, autoscaling_config=None, redis_password=None, - fate_share=None, - max_bytes=0, - backup_count=0): + fate_share=None): """Run a process to monitor the other processes. Args: @@ -1827,20 +1791,17 @@ def start_monitor(redis_address, no redirection should happen, then this should be None. autoscaling_config: path to autoscaling config file. redis_password (str): The password of the redis server. - max_bytes (int): Log rotation parameter. Corresponding to - RotatingFileHandler's maxBytes. - backup_count (int): Log rotation parameter. Corresponding to - RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" monitor_path = os.path.join(RAY_PATH, "monitor.py") command = [ - sys.executable, "-u", monitor_path, f"--logs-dir={logs_dir}", - f"--redis-address={redis_address}", - f"--logging-rotate-bytes={max_bytes}", - f"--logging-rotate-backup-count={backup_count}" + sys.executable, + "-u", + monitor_path, + f"--logs-dir={logs_dir}", + "--redis-address=" + str(redis_address), ] if autoscaling_config: command.append("--autoscaling-config=" + str(autoscaling_config)) diff --git a/python/ray/_raylet.pxd b/python/ray/_raylet.pxd index 4a0f7b923b54..e8edc78a71b1 100644 --- a/python/ray/_raylet.pxd +++ b/python/ray/_raylet.pxd @@ -101,8 +101,7 @@ cdef class CoreWorker: cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, c_vector[CObjectID] contained_ids, - CObjectID *c_object_id, shared_ptr[CBuffer] *data, - owner_address=*) + CObjectID *c_object_id, shared_ptr[CBuffer] *data) cdef store_task_outputs( self, worker, outputs, const c_vector[CObjectID] return_ids, c_vector[shared_ptr[CRayObject]] *returns) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3dda95988cd3..8ba80852fb40 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -477,12 +477,6 @@ cdef execute_task( if debugger_breakpoint != b"": ray.util.pdb.set_trace( breakpoint_uuid=debugger_breakpoint) - if inspect.iscoroutinefunction(function_executor): - raise ValueError( - "'async def' should not be used for remote " - "tasks. You can wrap the async function with " - "`asyncio.get_event_loop.run_until(f())`. 
" - "See more at docs.ray.io/async_api.html") outputs = function_executor(*args, **kwargs) next_breakpoint = ( ray.worker.global_worker.debugger_breakpoint) @@ -628,8 +622,7 @@ cdef void gc_collect() nogil: cdef c_vector[c_string] spill_objects_handler( - const c_vector[CObjectID]& object_ids_to_spill, - const c_vector[c_string]& owner_addresses) nogil: + const c_vector[CObjectID]& object_ids_to_spill) nogil: cdef c_vector[c_string] return_urls with gil: object_refs = VectorToObjectRefs(object_ids_to_spill) @@ -637,8 +630,7 @@ cdef c_vector[c_string] spill_objects_handler( with ray.worker._changeproctitle( ray_constants.WORKER_PROCESS_TYPE_SPILL_WORKER, ray_constants.WORKER_PROCESS_TYPE_SPILL_WORKER_IDLE): - urls = external_storage.spill_objects( - object_refs, owner_addresses) + urls = external_storage.spill_objects(object_refs) for url in urls: return_urls.push_back(url) except Exception: @@ -724,20 +716,6 @@ cdef void delete_spilled_objects_handler( job_id=None) -cdef void unhandled_exception_handler(const CRayObject& error) nogil: - with gil: - worker = ray.worker.global_worker - data = None - metadata = None - if error.HasData(): - data = Buffer.make(error.GetData()) - if error.HasMetadata(): - metadata = Buffer.make(error.GetMetadata()).to_pybytes() - # TODO(ekl) why does passing a ObjectRef.nil() lead to shutdown errors? - object_ids = [None] - worker.raise_errors([(data, metadata)], object_ids) - - # This function introduces ~2-7us of overhead per call (i.e., it can be called # up to hundreds of thousands of times per second). 
cdef void get_py_stack(c_string* stack_out) nogil: @@ -847,7 +825,6 @@ cdef class CoreWorker: options.spill_objects = spill_objects_handler options.restore_spilled_objects = restore_spilled_objects_handler options.delete_spilled_objects = delete_spilled_objects_handler - options.unhandled_exception_handler = unhandled_exception_handler options.get_lang_stack = get_py_stack options.ref_counting_enabled = True options.is_local_mode = local_mode @@ -921,18 +898,6 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) - def get_if_local(self, object_refs): - """Get objects from local plasma store directly - without a fetch request to raylet.""" - cdef: - c_vector[shared_ptr[CRayObject]] results - c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) - with nogil: - check_status( - CCoreWorkerProcess.GetCoreWorker().GetIfLocal( - c_object_ids, &results)) - return RayObjectsToDataMetadataPairs(results) - def object_exists(self, ObjectRef object_ref): cdef: c_bool has_object @@ -947,11 +912,7 @@ cdef class CoreWorker: cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, c_vector[CObjectID] contained_ids, - CObjectID *c_object_id, shared_ptr[CBuffer] *data, - owner_address=None): - cdef: - CAddress c_owner_address - + CObjectID *c_object_id, shared_ptr[CBuffer] *data): if object_ref is None: with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().CreateOwned( @@ -959,16 +920,11 @@ cdef class CoreWorker: c_object_id, data)) else: c_object_id[0] = object_ref.native() - if owner_address is None: - c_owner_address = CCoreWorkerProcess.GetCoreWorker( - ).GetRpcAddress() - else: - c_owner_address = CAddress() - c_owner_address.ParseFromString(owner_address) with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().CreateExisting( metadata, data_size, c_object_id[0], - c_owner_address, data)) + CCoreWorkerProcess.GetCoreWorker().GetRpcAddress(), + data)) # If data is nullptr, that means the 
ObjectRef already existed, # which we ignore. @@ -977,8 +933,7 @@ cdef class CoreWorker: return data.get() == NULL def put_file_like_object( - self, metadata, data_size, file_like, ObjectRef object_ref, - owner_address): + self, metadata, data_size, file_like, ObjectRef object_ref): """Directly create a new Plasma Store object from a file like object. This avoids extra memory copy. @@ -988,7 +943,6 @@ cdef class CoreWorker: file_like: A python file object that provides the `readinto` interface. object_ref: The new ObjectRef. - owner_address: Owner address for this object ref. """ cdef: CObjectID c_object_id @@ -1003,7 +957,7 @@ cdef class CoreWorker: object_already_exists = self._create_put_buffer( metadata_buf, data_size, object_ref, ObjectRefsToVector([]), - &c_object_id, &data_buf, owner_address) + &c_object_id, &data_buf) if object_already_exists: logger.debug("Object already exists in 'put_file_like_object'.") return @@ -1230,8 +1184,7 @@ cdef class CoreWorker: self, c_string name, c_vector[unordered_map[c_string, double]] bundles, - c_string strategy, - c_bool is_detached): + c_string strategy): cdef: CPlacementGroupID c_placement_group_id CPlacementStrategy c_strategy @@ -1255,8 +1208,7 @@ cdef class CoreWorker: CPlacementGroupCreationOptions( name, c_strategy, - bundles, - is_detached + bundles ), &c_placement_group_id)) @@ -1458,13 +1410,9 @@ cdef class CoreWorker: object_ref.native()) def remove_object_ref_reference(self, ObjectRef object_ref): - cdef: - CObjectID c_object_id = object_ref.native() - # We need to release the gil since object destruction may call the - # unhandled exception handler. - with nogil: - CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( - c_object_id) + # Note: faster to not release GIL for short-running op. 
+ CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( + object_ref.native()) def serialize_and_promote_object_ref(self, ObjectRef object_ref): cdef: @@ -1619,13 +1567,12 @@ cdef class CoreWorker: return ref_counts - def set_get_async_callback(self, ObjectRef object_ref, callback): - cpython.Py_INCREF(callback) + def get_async(self, ObjectRef object_ref, future): + cpython.Py_INCREF(future) CCoreWorkerProcess.GetCoreWorker().GetAsync( - object_ref.native(), - async_callback, - callback - ) + object_ref.native(), + async_set_result, + future) def push_error(self, JobID job_id, error_type, error_message, double timestamp): @@ -1639,11 +1586,13 @@ cdef class CoreWorker: resource_name.encode("ascii"), capacity, CNodeID.FromBinary(client_id.binary())) -cdef void async_callback(shared_ptr[CRayObject] obj, - CObjectID object_ref, - void *user_callback) with gil: +cdef void async_set_result(shared_ptr[CRayObject] obj, + CObjectID object_ref, + void *future) with gil: cdef: c_vector[shared_ptr[CRayObject]] objects_to_deserialize + py_future = (future) + loop = py_future._loop # Object is retrieved from in memory store. # Here we go through the code path used to deserialize objects. @@ -1654,6 +1603,23 @@ cdef void async_callback(shared_ptr[CRayObject] obj, result = ray.worker.global_worker.deserialize_objects( data_metadata_pairs, ids_to_deserialize)[0] - py_callback = user_callback - py_callback(result) - cpython.Py_DECREF(py_callback) + def set_future(): + # Issue #11030, #8841 + # If this future has result set already, we just need to + # skip the set result/exception procedure. 
+ if py_future.done(): + cpython.Py_DECREF(py_future) + return + + if isinstance(result, RayTaskError): + ray.worker.last_task_error_raise_time = time.time() + py_future.set_exception(result.as_instanceof_cause()) + elif isinstance(result, RayError): + # Directly raise exception for RayActorError + py_future.set_exception(result) + else: + py_future.set_result(result) + + cpython.Py_DECREF(py_future) + + loop.call_soon_threadsafe(set_future) diff --git a/python/ray/actor.py b/python/ray/actor.py index b24c04a10dd5..499cd1eacd36 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -13,8 +13,6 @@ from ray import ActorClassID, Language from ray._raylet import PythonFunctionDescriptor from ray._private.client_mode_hook import client_mode_hook -from ray._private.client_mode_hook import client_mode_should_convert -from ray._private.client_mode_hook import client_mode_convert_actor from ray import cross_language from ray.util.inspect import ( is_function_or_method, @@ -555,29 +553,6 @@ def _remote(self, if max_concurrency < 1: raise ValueError("max_concurrency must be >= 1") - if client_mode_should_convert(): - return client_mode_convert_actor( - self, - args, - kwargs, - num_cpus=num_cpus, - num_gpus=num_gpus, - memory=memory, - object_store_memory=object_store_memory, - resources=resources, - accelerator_type=accelerator_type, - max_concurrency=max_concurrency, - max_restarts=max_restarts, - max_task_retries=max_task_retries, - name=name, - lifetime=lifetime, - placement_group=placement_group, - placement_group_bundle_index=placement_group_bundle_index, - placement_group_capture_child_tasks=( - placement_group_capture_child_tasks), - override_environment_variables=( - override_environment_variables)) - worker = ray.worker.global_worker worker.check_connected() @@ -609,9 +584,7 @@ def _remote(self, elif lifetime == "detached": detached = True else: - raise ValueError( - "actor `lifetime` argument must be either `None` or 'detached'" - ) + raise 
ValueError("lifetime must be either `None` or 'detached'") if placement_group_capture_child_tasks is None: placement_group_capture_child_tasks = ( @@ -962,7 +935,7 @@ def _deserialization_helper(cls, state, outer_object_ref=None): def __reduce__(self): """This code path is used by pickling but not by Ray forking.""" state = self._serialization_helper() - return ActorHandle._deserialization_helper, state + return ActorHandle._deserialization_helper, (state) def modify_class(cls): diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py index 727c4db2effb..2838e24c18b4 100644 --- a/python/ray/autoscaler/_private/autoscaler.py +++ b/python/ray/autoscaler/_private/autoscaler.py @@ -43,7 +43,7 @@ # that will be passed into a NodeUpdaterThread. UpdateInstructions = namedtuple( "UpdateInstructions", - ["node_id", "setup_commands", "ray_start_commands", "docker_config"]) + ["node_id", "init_commands", "start_ray_commands", "docker_config"]) AutoscalerSummary = namedtuple( "AutoscalerSummary", @@ -283,7 +283,7 @@ def _update(self): # problems. They should at a minimum be spawned as daemon threads. # See https://github.com/ray-project/ray/pull/5903 for more info. 
T = [] - for node_id, setup_commands, ray_start_commands, docker_config in ( + for node_id, commands, ray_start, docker_config in ( self.should_update(node_id) for node_id in nodes): if node_id is not None: resources = self._node_resources(node_id) @@ -291,8 +291,8 @@ def _update(self): T.append( threading.Thread( target=self.spawn_updater, - args=(node_id, setup_commands, ray_start_commands, - resources, docker_config))) + args=(node_id, commands, ray_start, resources, + docker_config))) for t in T: t.start() for t in T: @@ -633,25 +633,25 @@ def should_update(self, node_id): successful_updated = self.num_successful_updates.get(node_id, 0) > 0 if successful_updated and self.config.get("restart_only", False): - setup_commands = [] - ray_start_commands = self.config["worker_start_ray_commands"] + init_commands = [] + ray_commands = self.config["worker_start_ray_commands"] elif successful_updated and self.config.get("no_restart", False): - setup_commands = self._get_node_type_specific_fields( + init_commands = self._get_node_type_specific_fields( node_id, "worker_setup_commands") - ray_start_commands = [] + ray_commands = [] else: - setup_commands = self._get_node_type_specific_fields( + init_commands = self._get_node_type_specific_fields( node_id, "worker_setup_commands") - ray_start_commands = self.config["worker_start_ray_commands"] + ray_commands = self.config["worker_start_ray_commands"] docker_config = self._get_node_specific_docker_config(node_id) return UpdateInstructions( node_id=node_id, - setup_commands=setup_commands, - ray_start_commands=ray_start_commands, + init_commands=init_commands, + start_ray_commands=ray_commands, docker_config=docker_config) - def spawn_updater(self, node_id, setup_commands, ray_start_commands, + def spawn_updater(self, node_id, init_commands, ray_start_commands, node_resources, docker_config): logger.info(f"Creating new (spawn_updater) updater thread for node" f" {node_id}.") @@ -665,8 +665,7 @@ def spawn_updater(self, node_id, 
setup_commands, ray_start_commands, initialization_commands=with_head_node_ip( self._get_node_type_specific_fields( node_id, "initialization_commands"), self.head_node_ip), - setup_commands=with_head_node_ip(setup_commands, - self.head_node_ip), + setup_commands=with_head_node_ip(init_commands, self.head_node_ip), ray_start_commands=with_head_node_ip(ray_start_commands, self.head_node_ip), runtime_hash=self.runtime_hash, @@ -766,7 +765,7 @@ def summary(self): ] is_pending = status in pending_states if is_pending: - pending_nodes.append((ip, node_type, status)) + pending_nodes.append((ip, node_type)) else: # TODO (Alex): Failed nodes are now immediately killed, so # this list will almost always be empty. We should ideally diff --git a/python/ray/autoscaler/_private/aws/config.py b/python/ray/autoscaler/_private/aws/config.py index 9aa3e6d85778..79fc57896dac 100644 --- a/python/ray/autoscaler/_private/aws/config.py +++ b/python/ray/autoscaler/_private/aws/config.py @@ -5,7 +5,6 @@ import json import os import time -from typing import Any, Dict, List import logging import boto3 @@ -155,11 +154,11 @@ def print_info(resource_string, _tags=workers_tags) tags = {"default": _log_info["head_instance_profile_src"] == "default"} - profile_arn = config["head_node"]["IamInstanceProfile"].get("Arn") - profile_name = _arn_to_name(profile_arn) \ - if profile_arn \ - else config["head_node"]["IamInstanceProfile"]["Name"] - cli_logger.labeled_value("IAM Profile", "{}", profile_name, _tags=tags) + cli_logger.labeled_value( + "IAM Profile", + "{}", + _arn_to_name(config["head_node"]["IamInstanceProfile"]["Arn"]), + _tags=tags) if ("KeyName" in config["head_node"] and "KeyName" in config["worker_nodes"]): @@ -358,23 +357,9 @@ def _configure_subnet(config): ec2 = _resource("ec2", config) use_internal_ips = config["provider"].get("use_internal_ips", False) - # If head or worker security group is specified, filter down to subnets - # belonging to the same VPC as the security group. 
- sg_ids = (config["head_node"].get("SecurityGroupIds", []) + - config["worker_nodes"].get("SecurityGroupIds", [])) - if sg_ids: - vpc_id_of_sg = _get_vpc_id_of_sg(sg_ids, config) - else: - vpc_id_of_sg = None - try: - candidate_subnets = ec2.subnets.all() - if vpc_id_of_sg: - candidate_subnets = [ - s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg - ] subnets = sorted( - (s for s in candidate_subnets if s.state == "available" and ( + (s for s in ec2.subnets.all() if s.state == "available" and ( use_internal_ips or s.map_public_ip_on_launch)), reverse=True, # sort from Z-A key=lambda subnet: subnet.availability_zone) @@ -429,34 +414,6 @@ def _configure_subnet(config): return config -def _get_vpc_id_of_sg(sg_ids: List[str], config: Dict[str, Any]) -> str: - """Returns the VPC id of the security groups with the provided security - group ids. - - Errors if the provided security groups belong to multiple VPCs. - Errors if no security group with any of the provided ids is identified. - """ - sg_ids = list(set(sg_ids)) - - ec2 = _resource("ec2", config) - filters = [{"Name": "group-id", "Values": sg_ids}] - security_groups = ec2.security_groups.filter(Filters=filters) - vpc_ids = [sg.vpc_id for sg in security_groups] - vpc_ids = list(set(vpc_ids)) - - multiple_vpc_msg = "All security groups specified in the cluster config "\ - "should belong to the same VPC." - cli_logger.doassert(len(vpc_ids) <= 1, multiple_vpc_msg) - assert len(vpc_ids) <= 1, multiple_vpc_msg - - no_sg_msg = "Failed to detect a security group with id equal to any of "\ - "the configured SecurityGroupIds." - cli_logger.doassert(len(vpc_ids) > 0, no_sg_msg) - assert len(vpc_ids) > 0, no_sg_msg - - return vpc_ids[0] - - def _configure_security_group(config): _set_config_info( head_security_group_src="config", workers_security_group_src="config") @@ -496,13 +453,11 @@ def _check_ami(config): # If we do not provide a default AMI for the given region, noop. 
return - head_ami = config["head_node"].get("ImageId", "").lower() - if head_ami in ["", "latest_dlami"]: + if config["head_node"].get("ImageId", "").lower() == "latest_dlami": config["head_node"]["ImageId"] = default_ami _set_config_info(head_ami_src="dlami") - worker_ami = config["worker_nodes"].get("ImageId", "").lower() - if worker_ami in ["", "latest_dlami"]: + if config["worker_nodes"].get("ImageId", "").lower() == "latest_dlami": config["worker_nodes"]["ImageId"] = default_ami _set_config_info(workers_ami_src="dlami") @@ -611,13 +566,6 @@ def _create_security_group(config, vpc_id, group_name): def _upsert_security_group_rules(conf, security_groups): sgids = {sg.id for sg in security_groups.values()} - - # Update sgids to include user-specified security groups. - # This is necessary if the user specifies the head node type's security - # groups but not the worker's, or vice-versa. - for node_type in NODE_KIND_CONFIG_KEYS.values(): - sgids.update(conf[node_type].get("SecurityGroupIds", [])) - # sort security group items for deterministic inbound rule config order # (mainly supports more precise stub-based boto3 unit testing) for node_type, sg in sorted(security_groups.items()): @@ -635,7 +583,7 @@ def _update_inbound_rules(target_security_group, sgids, config): def _create_default_inbound_rules(sgids, extended_rules=[]): - intracluster_rules = _create_default_intracluster_inbound_rules(sgids) + intracluster_rules = _create_default_instracluster_inbound_rules(sgids) ssh_rules = _create_default_ssh_inbound_rules() merged_rules = itertools.chain( intracluster_rules, @@ -645,7 +593,7 @@ def _create_default_inbound_rules(sgids, extended_rules=[]): return list(merged_rules) -def _create_default_intracluster_inbound_rules(intracluster_sgids): +def _create_default_instracluster_inbound_rules(intracluster_sgids): return [{ "FromPort": -1, "ToPort": -1, diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index 
2a3b7ae65a69..f328d4fd6c1a 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -584,9 +584,6 @@ def __init__(self, docker_config, **common_args): self.docker_config = docker_config self.home_dir = None self.initialized = False - # Optionally use 'podman' instead of 'docker' - use_podman = docker_config.get("use_podman", False) - self.docker_cmd = "podman" if use_podman else "docker" def run( self, @@ -601,8 +598,8 @@ def run( shutdown_after_run=False, ): if run_env == "auto": - run_env = "host" if (not bool(cmd) or cmd.find( - self.docker_cmd) == 0) else self.docker_cmd + run_env = "host" if (not bool(cmd) + or cmd.find("docker") == 0) else "docker" if environment_variables: cmd = _with_environment_variables(cmd, environment_variables) @@ -614,8 +611,7 @@ def run( cmd = with_docker_exec( [cmd], container_name=self.container_name, - with_interactive=is_using_login_shells(), - docker_cmd=self.docker_cmd)[0] + with_interactive=is_using_login_shells())[0] if shutdown_after_run: # sudo shutdown should run after `with_docker_exec` command above @@ -636,10 +632,8 @@ def run_rsync_up(self, source, target, options=None): self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), target.lstrip("/")) - host_mount_location = os.path.dirname(host_destination.rstrip("/")) self.ssh_command_runner.run( - f"mkdir -p {host_mount_location} && chown -R " - f"{self.ssh_command_runner.ssh_user} {host_mount_location}", + f"mkdir -p {os.path.dirname(host_destination.rstrip('/'))}", silent=is_rsync_silent()) self.ssh_command_runner.run_rsync_up( @@ -651,9 +645,9 @@ def run_rsync_up(self, source, target, options=None): # Without it, docker copies the source *into* the target host_destination += "/." 
self.ssh_command_runner.run( - "{} cp {} {}:{}".format(self.docker_cmd, host_destination, - self.container_name, - self._docker_expand_user(target)), + "docker cp {} {}:{}".format(host_destination, + self.container_name, + self._docker_expand_user(target)), silent=is_rsync_silent()) def run_rsync_down(self, source, target, options=None): @@ -661,10 +655,8 @@ def run_rsync_down(self, source, target, options=None): host_source = os.path.join( self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), source.lstrip("/")) - host_mount_location = os.path.dirname(host_source.rstrip("/")) self.ssh_command_runner.run( - f"mkdir -p {host_mount_location} && chown -R " - f"{self.ssh_command_runner.ssh_user} {host_mount_location}", + f"mkdir -p {os.path.dirname(host_source.rstrip('/'))}", silent=is_rsync_silent()) if source[-1] == "/": source += "." @@ -672,9 +664,9 @@ def run_rsync_down(self, source, target, options=None): # Without it, docker copies the source *into* the target if not options.get("docker_mount_if_possible", False): self.ssh_command_runner.run( - "{} cp {}:{} {}".format(self.docker_cmd, self.container_name, - self._docker_expand_user(source), - host_source), + "docker cp {}:{} {}".format(self.container_name, + self._docker_expand_user(source), + host_source), silent=is_rsync_silent()) self.ssh_command_runner.run_rsync_down( host_source, target, options=options) @@ -682,30 +674,22 @@ def run_rsync_down(self, source, target, options=None): def remote_shell_command_str(self): inner_str = self.ssh_command_runner.remote_shell_command_str().replace( "ssh", "ssh -tt", 1).strip("\n") - return inner_str + " {} exec -it {} /bin/bash\n".format( - self.docker_cmd, self.container_name) + return inner_str + " docker exec -it {} /bin/bash\n".format( + self.container_name) def _check_docker_installed(self): no_exist = "NoExist" output = self.ssh_command_runner.run( - f"command -v {self.docker_cmd} || echo '{no_exist}'", - with_output=True) + f"command -v 
docker || echo '{no_exist}'", with_output=True) cleaned_output = output.decode().strip() if no_exist in cleaned_output or "docker" not in cleaned_output: - if self.docker_cmd == "docker": - install_commands = [ - "curl -fsSL https://get.docker.com -o get-docker.sh", - "sudo sh get-docker.sh", "sudo usermod -aG docker $USER", - "sudo systemctl restart docker -f" - ] - else: - install_commands = [ - "sudo apt-get update", "sudo apt-get -y install podman" - ] - + install_commands = [ + "curl -fsSL https://get.docker.com -o get-docker.sh", + "sudo sh get-docker.sh", "sudo usermod -aG docker $USER", + "sudo systemctl restart docker -f" + ] logger.error( - f"{self.docker_cmd.capitalize()} not installed. You can " - f"install {self.docker_cmd.capitalize()} by adding the " + "Docker not installed. You can install Docker by adding the " "following commands to 'initialization_commands':\n" + "\n".join(install_commands)) @@ -713,7 +697,7 @@ def _check_container_status(self): if self.initialized: return True output = self.ssh_command_runner.run( - check_docker_running_cmd(self.container_name, self.docker_cmd), + check_docker_running_cmd(self.container_name), with_output=True).decode("utf-8").strip() # Checks for the false positive where "true" is in the container name return ("true" in output.lower() @@ -724,8 +708,7 @@ def _docker_expand_user(self, string, any_char=False): if user_pos > -1: if self.home_dir is None: self.home_dir = self.ssh_command_runner.run( - f"{self.docker_cmd} exec {self.container_name} " - "printenv HOME", + f"docker exec {self.container_name} printenv HOME", with_output=True).decode("utf-8").strip() if any_char: @@ -740,7 +723,7 @@ def _check_if_container_restart_is_needed( self, image: str, cleaned_bind_mounts: Dict[str, str]) -> bool: re_init_required = False running_image = self.run( - check_docker_image(self.container_name, self.docker_cmd), + check_docker_image(self.container_name), with_output=True, run_env="host").decode("utf-8").strip() if 
running_image != image: @@ -749,7 +732,7 @@ def _check_if_container_restart_is_needed( "of {} (which was provided in the YAML)", self.container_name, running_image, image) mounts = self.run( - check_bind_mounts_cmd(self.container_name, self.docker_cmd), + check_bind_mounts_cmd(self.container_name), with_output=True, run_env="host").decode("utf-8").strip() try: @@ -791,14 +774,12 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): if self.docker_config.get("pull_before_run", True): assert specific_image, "Image must be included in config if " + \ "pull_before_run is specified" - self.run( - "{} pull {}".format(self.docker_cmd, specific_image), - run_env="host") + self.run("docker pull {}".format(specific_image), run_env="host") else: - self.run(f"{self.docker_cmd} image inspect {specific_image} " - "1> /dev/null 2>&1 || " - f"{self.docker_cmd} pull {specific_image}") + self.run( + f"docker image inspect {specific_image} 1> /dev/null 2>&1 || " + f"docker pull {specific_image}") # Bootstrap files cannot be bind mounted because docker opens the # underlying inode. When the file is switched, docker becomes outdated. 
@@ -814,15 +795,12 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): requires_re_init = self._check_if_container_restart_is_needed( specific_image, cleaned_bind_mounts) if requires_re_init: - self.run( - f"{self.docker_cmd} stop {self.container_name}", - run_env="host") + self.run(f"docker stop {self.container_name}", run_env="host") if (not container_running) or requires_re_init: # Get home directory image_env = self.ssh_command_runner.run( - f"{self.docker_cmd} " + "inspect -f '{{json .Config.Env}}' " + - specific_image, + "docker inspect -f '{{json .Config.Env}}' " + specific_image, with_output=True).decode().strip() home_directory = "/root" for env_var in json.loads(image_env): @@ -837,8 +815,7 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): "run_options", []) + self.docker_config.get( f"{'head' if as_head else 'worker'}_run_options", []) + self._configure_runtime() + self._auto_configure_shm(), - self.ssh_command_runner.cluster_name, home_directory, - self.docker_cmd) + self.ssh_command_runner.cluster_name, home_directory) self.run(start_command, run_env="host") docker_run_executed = True @@ -851,8 +828,7 @@ def run_init(self, *, as_head, file_mounts, sync_run_yet): # is called before the first `file_sync` happens self.run_rsync_up(file_mounts[mount], mount) self.ssh_command_runner.run( - "{cmd} cp {src} {container}:{dst}".format( - cmd=self.docker_cmd, + "docker cp {src} {container}:{dst}".format( src=os.path.join( self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), mount), @@ -866,7 +842,7 @@ def _configure_runtime(self): return [] runtime_output = self.ssh_command_runner.run( - f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ", + "docker info -f '{{.Runtimes}}' ", with_output=True).decode().strip() if "nvidia-container-runtime" in runtime_output: try: diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index d967543ff984..df0a104493eb 100644 --- 
a/python/ray/autoscaler/_private/commands.py +++ b/python/ray/autoscaler/_private/commands.py @@ -34,7 +34,7 @@ from ray.autoscaler._private.cli_logger import cli_logger, cf from ray.autoscaler._private.updater import NodeUpdaterThread from ray.autoscaler._private.command_runner import set_using_login_shells, \ - set_rsync_silent + set_rsync_silent from ray.autoscaler._private.event_system import (CreateClusterEvent, global_event_system) from ray.autoscaler._private.log_timer import LogTimer @@ -137,22 +137,17 @@ def request_resources(num_cpus: Optional[int] = None, overwrite=True) -def create_or_update_cluster( - config_file: str, - override_min_workers: Optional[int], - override_max_workers: Optional[int], - no_restart: bool, - restart_only: bool, - yes: bool, - override_cluster_name: Optional[str] = None, - no_config_cache: bool = False, - redirect_command_output: Optional[bool] = False, - use_login_shells: bool = True, - no_monitor_on_head: bool = False) -> Dict[str, Any]: - """Creates or updates an autoscaling Ray cluster from a config json.""" - # no_monitor_on_head is an internal flag used by the Ray K8s operator. - # If True, prevents autoscaling config sync to the Ray head during cluster - # creation. See https://github.com/ray-project/ray/pull/13720. 
+def create_or_update_cluster(config_file: str, + override_min_workers: Optional[int], + override_max_workers: Optional[int], + no_restart: bool, + restart_only: bool, + yes: bool, + override_cluster_name: Optional[str] = None, + no_config_cache: bool = False, + redirect_command_output: Optional[bool] = False, + use_login_shells: bool = True) -> Dict[str, Any]: + """Create or updates an autoscaling Ray cluster from a config json.""" set_using_login_shells(use_login_shells) if not use_login_shells: cmd_output_util.set_allow_interactive(False) @@ -230,7 +225,7 @@ def handle_cli_override(key, override): try_logging_config(config) get_or_create_head_node(config, config_file, no_restart, restart_only, yes, - override_cluster_name, no_monitor_on_head) + override_cluster_name) return config @@ -490,17 +485,13 @@ def monitor_cluster(cluster_config_file: str, num_lines: int, port_forward=None) -def warn_about_bad_start_command(start_commands: List[str], - no_monitor_on_head: bool = False) -> None: +def warn_about_bad_start_command(start_commands: List[str]) -> None: ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands)) if len(ray_start_cmd) == 0: cli_logger.warning( "Ray runtime will not be started because `{}` is not in `{}`.", cf.bold("ray start"), cf.bold("head_start_ray_commands")) - - autoscaling_config_in_ray_start_cmd = any( - "autoscaling-config" in x for x in ray_start_cmd) - if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head): + if not any("autoscaling-config" in x for x in ray_start_cmd): cli_logger.warning( "The head node will not launch any workers because " "`{}` does not have `{}` set.\n" @@ -516,7 +507,6 @@ def get_or_create_head_node(config: Dict[str, Any], restart_only: bool, yes: bool, override_cluster_name: Optional[str], - no_monitor_on_head: bool = False, _provider: Optional[NodeProvider] = None, _runner: ModuleType = subprocess) -> None: """Create the cluster head node, which in turn creates the workers.""" @@ -639,19 
+629,44 @@ def get_or_create_head_node(config: Dict[str, Any], (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf( config["file_mounts"], None, config) - if not no_monitor_on_head: - # Return remote_config_file to avoid prematurely closing it. - config, remote_config_file = _set_up_config_for_head_node( - config, provider, no_restart) - cli_logger.print("Prepared bootstrap config") + # Rewrite the auth config so that the head + # node can update the workers + remote_config = copy.deepcopy(config) + + # drop proxy options if they exist, otherwise + # head node won't be able to connect to workers + remote_config["auth"].pop("ssh_proxy_command", None) + + if "ssh_private_key" in config["auth"]: + remote_key_path = "~/ray_bootstrap_key.pem" + remote_config["auth"]["ssh_private_key"] = remote_key_path + + # Adjust for new file locations + new_mounts = {} + for remote_path in config["file_mounts"]: + new_mounts[remote_path] = remote_path + remote_config["file_mounts"] = new_mounts + remote_config["no_restart"] = no_restart + + remote_config = provider.prepare_for_head_node(remote_config) + + # Now inject the rewritten config and SSH key into the head node + remote_config_file = tempfile.NamedTemporaryFile( + "w", prefix="ray-bootstrap-") + remote_config_file.write(json.dumps(remote_config)) + remote_config_file.flush() + config["file_mounts"].update({ + "~/ray_bootstrap_config.yaml": remote_config_file.name + }) + + if "ssh_private_key" in config["auth"]: + config["file_mounts"].update({ + remote_key_path: config["auth"]["ssh_private_key"], + }) + cli_logger.print("Prepared bootstrap config") if restart_only: - # Docker may re-launch nodes, requiring setup - # commands to be rerun. 
- if config.get("docker", {}).get("container_name"): - setup_commands = config["head_setup_commands"] - else: - setup_commands = [] + setup_commands = [] ray_start_commands = config["head_start_ray_commands"] elif no_restart: setup_commands = config["head_setup_commands"] @@ -661,8 +676,7 @@ def get_or_create_head_node(config: Dict[str, Any], ray_start_commands = config["head_start_ray_commands"] if not no_restart: - warn_about_bad_start_command(ray_start_commands, - no_monitor_on_head) + warn_about_bad_start_command(ray_start_commands) updater = NodeUpdaterThread( node_id=head_node, @@ -683,8 +697,7 @@ def get_or_create_head_node(config: Dict[str, Any], "rsync_exclude": config.get("rsync_exclude"), "rsync_filter": config.get("rsync_filter") }, - docker_config=config.get("docker"), - restart_only=restart_only) + docker_config=config.get("docker")) updater.start() updater.join() @@ -724,54 +737,6 @@ def get_or_create_head_node(config: Dict[str, Any], cli_logger.print(" {}", remote_shell_str.strip()) -def _set_up_config_for_head_node(config: Dict[str, Any], - provider: NodeProvider, - no_restart: bool) ->\ - Tuple[Dict[str, Any], Any]: - """Prepares autoscaling config and, if needed, ssh key, to be mounted onto - the Ray head node for use by the autoscaler. - - Returns the modified config and the temporary config file that will be - mounted onto the head node. 
- """ - # Rewrite the auth config so that the head - # node can update the workers - remote_config = copy.deepcopy(config) - - # drop proxy options if they exist, otherwise - # head node won't be able to connect to workers - remote_config["auth"].pop("ssh_proxy_command", None) - - if "ssh_private_key" in config["auth"]: - remote_key_path = "~/ray_bootstrap_key.pem" - remote_config["auth"]["ssh_private_key"] = remote_key_path - - # Adjust for new file locations - new_mounts = {} - for remote_path in config["file_mounts"]: - new_mounts[remote_path] = remote_path - remote_config["file_mounts"] = new_mounts - remote_config["no_restart"] = no_restart - - remote_config = provider.prepare_for_head_node(remote_config) - - # Now inject the rewritten config and SSH key into the head node - remote_config_file = tempfile.NamedTemporaryFile( - "w", prefix="ray-bootstrap-") - remote_config_file.write(json.dumps(remote_config)) - remote_config_file.flush() - config["file_mounts"].update({ - "~/ray_bootstrap_config.yaml": remote_config_file.name - }) - - if "ssh_private_key" in config["auth"]: - config["file_mounts"].update({ - remote_key_path: config["auth"]["ssh_private_key"], - }) - - return config, remote_config_file - - def attach_cluster(config_file: str, start: bool, use_screen: bool, diff --git a/python/ray/autoscaler/_private/constants.py b/python/ray/autoscaler/_private/constants.py index 2fbf6ec325e4..3fd3ec65e095 100644 --- a/python/ray/autoscaler/_private/constants.py +++ b/python/ray/autoscaler/_private/constants.py @@ -15,9 +15,6 @@ def env_integer(key, default): # Whether event logging to driver is enabled. Set to 0 to disable. AUTOSCALER_EVENTS = env_integer("AUTOSCALER_EVENTS", 1) -# Whether to avoid launching GPU nodes for CPU only tasks. 
-AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1) - # How long to wait for a node to start, in seconds NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900) diff --git a/python/ray/autoscaler/_private/docker.py b/python/ray/autoscaler/_private/docker.py index 9a21cd9cbd36..46bb20a3feca 100644 --- a/python/ray/autoscaler/_private/docker.py +++ b/python/ray/autoscaler/_private/docker.py @@ -29,10 +29,8 @@ def validate_docker_config(config): def with_docker_exec(cmds, container_name, - docker_cmd, env_vars=None, with_interactive=False): - assert docker_cmd, "Must provide docker command" env_str = "" if env_vars: env_str = " ".join( @@ -47,27 +45,27 @@ def with_docker_exec(cmds, ] -def _check_helper(cname, template, docker_cmd): +def _check_helper(cname, template): return " ".join([ - docker_cmd, "inspect", "-f", "'{{" + template + "}}'", cname, "||", + "docker", "inspect", "-f", "'{{" + template + "}}'", cname, "||", "true" ]) -def check_docker_running_cmd(cname, docker_cmd): - return _check_helper(cname, ".State.Running", docker_cmd) +def check_docker_running_cmd(cname): + return _check_helper(cname, ".State.Running") -def check_bind_mounts_cmd(cname, docker_cmd): - return _check_helper(cname, "json .Mounts", docker_cmd) +def check_bind_mounts_cmd(cname): + return _check_helper(cname, "json .Mounts") -def check_docker_image(cname, docker_cmd): - return _check_helper(cname, ".Config.Image", docker_cmd) +def check_docker_image(cname): + return _check_helper(cname, ".Config.Image") def docker_start_cmds(user, image, mount_dict, container_name, user_options, - cluster_name, home_directory, docker_cmd): + cluster_name, home_directory): # Imported here due to circular dependency. 
from ray.autoscaler.sdk import get_docker_host_mount_location docker_mount_prefix = get_docker_host_mount_location(cluster_name) @@ -86,7 +84,7 @@ def docker_start_cmds(user, image, mount_dict, container_name, user_options, user_options_str = " ".join(user_options) docker_run = [ - docker_cmd, "run", "--rm", "--name {}".format(container_name), "-d", + "docker", "run", "--rm", "--name {}".format(container_name), "-d", "-it", mount_flags, env_flags, user_options_str, "--net=host", image, "bash" ] diff --git a/python/ray/autoscaler/_private/kubernetes/config.py b/python/ray/autoscaler/_private/kubernetes/config.py index dcc315bc9c92..b285e7701ff6 100644 --- a/python/ray/autoscaler/_private/kubernetes/config.py +++ b/python/ray/autoscaler/_private/kubernetes/config.py @@ -94,11 +94,6 @@ def get_autodetected_resources(container_data): for resource_name in ["cpu", "gpu"] } - # Throw out GPU from resource dict if the amount is 0. - for key in copy.deepcopy(node_type_resources): - if node_type_resources[key] == 0: - del node_type_resources[key] - return node_type_resources diff --git a/python/ray/autoscaler/_private/load_metrics.py b/python/ray/autoscaler/_private/load_metrics.py index 09ea112381ed..bf9dc564bdca 100644 --- a/python/ray/autoscaler/_private/load_metrics.py +++ b/python/ray/autoscaler/_private/load_metrics.py @@ -5,7 +5,6 @@ from typing import Dict, List import numpy as np -import ray.ray_constants import ray._private.services as services from ray.autoscaler._private.constants import MEMORY_RESOURCE_UNIT_BYTES,\ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE @@ -213,15 +212,8 @@ def summary(self): ) if self.static_resources_by_ip else {} usage_dict = {} for key in total_resources: - if key in ["memory", "object_store_memory"]: - total = total_resources[key] * \ - ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES - available = available_resources[key] * \ - ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES - usage_dict[key] = (total - available, total) - else: - total = 
total_resources[key] - usage_dict[key] = (total - available_resources[key], total) + total = total_resources[key] + usage_dict[key] = (total - available_resources[key], total) summarized_demand_vector = freq_of_dicts( self.get_resource_demand_vector(clip=False)) diff --git a/python/ray/autoscaler/_private/resource_demand_scheduler.py b/python/ray/autoscaler/_private/resource_demand_scheduler.py index 0a08e0579b2e..523fd7d2f028 100644 --- a/python/ray/autoscaler/_private/resource_demand_scheduler.py +++ b/python/ray/autoscaler/_private/resource_demand_scheduler.py @@ -17,7 +17,6 @@ from ray.autoscaler.node_provider import NodeProvider from ray.gcs_utils import PlacementGroupTableData from ray.core.generated.common_pb2 import PlacementStrategy -from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES from ray.autoscaler.tags import ( TAG_RAY_USER_NODE_TYPE, NODE_KIND_UNMANAGED, NODE_TYPE_LEGACY_WORKER, NODE_KIND_WORKER, NODE_TYPE_LEGACY_HEAD, TAG_RAY_NODE_KIND, NODE_KIND_HEAD) @@ -640,7 +639,7 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict], # resources. This will behave properly with the current utilization # score heuristic, but it's a little dangerous and misleading. logger.warning( - f"The autoscaler could not find a node type to satisfy the " + f"The autoscaler could not find a node type to satisfy the" f"request: {resources}. If this request is related to " f"placement groups the resource request will resolve itself, " f"otherwise please specify a node type with the necessary " @@ -665,16 +664,8 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict], def _utilization_score(node_resources: ResourceDict, - resources: List[ResourceDict]) -> float: + resources: ResourceDict) -> float: remaining = copy.deepcopy(node_resources) - is_gpu_node = "GPU" in node_resources - any_gpu_task = any("GPU" in r for r in resources) - - # Avoid launching GPU nodes if there aren't any GPU tasks at all. 
Note that - # if there *is* a GPU task, then CPU tasks can be scheduled as well. - if AUTOSCALER_CONSERVE_GPU_NODES: - if is_gpu_node and not any_gpu_task: - return None fittable = [] for r in resources: diff --git a/python/ray/autoscaler/_private/updater.py b/python/ray/autoscaler/_private/updater.py index 14981252cd6d..7256d9046f49 100644 --- a/python/ray/autoscaler/_private/updater.py +++ b/python/ray/autoscaler/_private/updater.py @@ -48,7 +48,6 @@ class NodeUpdater: use_internal_ip: Wwhether the node_id belongs to an internal ip or external ip. docker_config: Docker section of autoscaler yaml - restart_only: Whether to skip setup commands & just restart ray """ def __init__(self, @@ -69,8 +68,7 @@ def __init__(self, rsync_options=None, process_runner=subprocess, use_internal_ip=False, - docker_config=None, - restart_only=False): + docker_config=None): self.log_prefix = "NodeUpdater: {}: ".format(node_id) use_internal_ip = (use_internal_ip @@ -108,7 +106,6 @@ def __init__(self, self.auth_config = auth_config self.is_head_node = is_head_node self.docker_config = docker_config - self.restart_only = restart_only def run(self): if cmd_output_util.does_allow_interactive( @@ -301,11 +298,6 @@ def do_update(self): sync_run_yet=False) if init_required: node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate" - # This ensures that `setup_commands` are not removed - self.restart_only = False - - if self.restart_only: - self.setup_commands = [] # runtime_hash will only change whenever the user restarts # or updates their cluster with `get_or_create_head_node` diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 788da5cc2da6..81a2c1fc00ff 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -85,14 +85,13 @@ def validate_config(config: Dict[str, Any]) -> None: if config["head_node_type"] not in config["available_node_types"]: raise ValueError( "`head_node_type` must be one of 
`available_node_types`.") - - sum_min_workers = sum( - config["available_node_types"][node_type].get("min_workers", 0) - for node_type in config["available_node_types"]) - if sum_min_workers > config["max_workers"]: - raise ValueError( - "The specified global `max_workers` is smaller than the " - "sum of `min_workers` of all the available node types.") + if "worker_default_node_type" not in config: + raise ValueError("You must specify `worker_default_node_type` if " + "`available_node_types is set.") + if (config["worker_default_node_type"] not in config[ + "available_node_types"]): + raise ValueError("`worker_default_node_type` must be one of " + "`available_node_types`.") def prepare_config(config): @@ -124,7 +123,8 @@ def rewrite_legacy_yaml_to_available_node_types( }, } config["head_node_type"] = NODE_TYPE_LEGACY_HEAD - del config["min_workers"] + config["worker_default_node_type"] = NODE_TYPE_LEGACY_WORKER + return config @@ -313,12 +313,12 @@ def format_pg(pg): def get_usage_report(lm_summary) -> str: usage_lines = [] - for resource, (used, total) in sorted(lm_summary.usage.items()): + for resource, (used, total) in lm_summary.usage.items(): if "node:" in resource: continue # Skip the auto-added per-node "node:" resource. 
line = f" {used}/{total} {resource}" if resource in ["memory", "object_store_memory"]: - to_GiB = 1 / 2**30 + to_GiB = ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES / 2**30 used *= to_GiB total *= to_GiB line = f" {used:.2f}/{total:.3f} GiB {resource}" @@ -362,8 +362,8 @@ def format_info_string(lm_summary, autoscaler_summary, time=None): for node_type, count in autoscaler_summary.pending_launches.items(): line = f" {node_type}, {count} launching" pending_lines.append(line) - for ip, node_type, status in autoscaler_summary.pending_nodes: - line = f" {ip}: {node_type}, {status.lower()}" + for ip, node_type in autoscaler_summary.pending_nodes: + line = f" {ip}: {node_type}, setting up" pending_lines.append(line) if pending_lines: pending_report = "\n".join(pending_lines) diff --git a/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml b/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml deleted file mode 100644 index b940366a0e2f..000000000000 --- a/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml +++ /dev/null @@ -1,31 +0,0 @@ -cluster_name: sg - -max_workers: 1 - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - -auth: - ssh_user: ubuntu - -# If required, head and worker nodes can exist on subnets in different VPCs and -# communicate via VPC peering. - -# VPC peering overview: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-peering.html. -# Setup VPC peering: https://docs.aws.amazon.com/vpc/latest/peering/create-vpc-peering-connection.html. -# Configure VPC peering route tables: https://docs.aws.amazon.com/vpc/latest/peering/vpc-peering-routing.html. - -# To enable external SSH connectivity, you should also ensure that your VPC -# is configured to assign public IPv4 addresses to every EC2 instance -# assigned to it. -head_node: - SecurityGroupIds: - - sg-1234abcd # Replace with an actual security group id. 
- -worker_nodes: - SecurityGroupIds: - - sg-1234abcd # Replace with an actual security group id. - - diff --git a/python/ray/autoscaler/aws/example-multi-node-type.yaml b/python/ray/autoscaler/aws/example-multi-node-type.yaml index 19584c69df2d..56b5c1b78d2e 100644 --- a/python/ray/autoscaler/aws/example-multi-node-type.yaml +++ b/python/ray/autoscaler/aws/example-multi-node-type.yaml @@ -1,5 +1,6 @@ # Experimental: an example of configuring a mixed-node-type cluster. cluster_name: multi_node_type +min_workers: 1 max_workers: 40 # The autoscaler will scale up the cluster faster with higher upscaling speed. @@ -54,6 +55,9 @@ available_node_types: # Specify the node type of the head node (as configured above). head_node_type: cpu_4_ondemand +# Specify the default type of the worker node (as configured above). +worker_default_node_type: cpu_16_spot + # The default settings for the head node. This will be merged with the per-node # type configs given above. head_node: diff --git a/python/ray/autoscaler/kubernetes/defaults.yaml b/python/ray/autoscaler/kubernetes/defaults.yaml index 4d6d481927f9..31b3301ea0f6 100644 --- a/python/ray/autoscaler/kubernetes/defaults.yaml +++ b/python/ray/autoscaler/kubernetes/defaults.yaml @@ -1,8 +1,12 @@ -# A unique identifier for the head node and workers of this cluster. -cluster_name: defaults +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 # The maximum number of workers nodes to launch in addition to the head -# node. +# node. This takes precedence over min_workers. max_workers: 2 # The autoscaler will scale up the cluster faster with higher upscaling speed. @@ -74,83 +78,127 @@ provider: # NOTE: If you're running multiple Ray clusters with services # on one Kubernetes cluster, they must have unique service # names. 
- name: example-cluster-ray-head + name: ray-head spec: # This selector must match the head node pod's selector below. selector: - component: example-cluster-ray-head + component: ray-head + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + + # Service that maps to the worker nodes of the Ray cluster. + - apiVersion: v1 + kind: Service + metadata: + # NOTE: If you're running multiple Ray clusters with services + # on one Kubernetes cluster, they must have unique service + # names. + name: ray-workers + spec: + # This selector must match the worker node pods' selector below. + selector: + component: ray-worker ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - -# Specify the pod type for the ray head node (as configured below). -head_node_type: head_node -# Specify the allowed pod types for this ray cluster and the resources they provide. -available_node_types: - worker_node: - # Minimum number of Ray workers of this Pod type. - min_workers: 0 - # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers. - max_workers: 2 - node_config: - apiVersion: v1 - kind: Pod - metadata: + - protocol: TCP + port: 8000 + targetPort: 8000 + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-worker- - spec: + generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. + labels: + component: ray-head + spec: + # Change this if you altered the autoscaler_service_account above + # or want to provide your own. + serviceAccountName: autoscaler + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. 
restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) image: rayproject/ray:nightly + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi - head_node: - node_config: - apiVersion: v1 - kind: Pod - metadata: + requests: + cpu: 1000m + memory: 512Mi + limits: + # The maximum memory that this pod is allowed to use. 
The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - # Must match the head node service selector above if a head node + generateName: ray-worker- + + # Must match the worker node service selector above if a worker node # service is required. labels: - component: example-cluster-ray-head - spec: - # Change this if you altered the autoscaler_service_account above - # or want to provide your own. - serviceAccountName: autoscaler + component: ray-worker + spec: + serviceAccountName: default + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. restartPolicy: Never # This volume allocates shared memory for Ray to use for its plasma @@ -159,51 +207,45 @@ available_node_types: volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) image: rayproject/ray:nightly # Do not change this command - it keeps the pod alive until it is # explicitly killed. 
command: ["/bin/bash", "-c", "--"] - args: ['trap : TERM INT; sleep infinity & wait;'] + args: ["trap : TERM INT; sleep infinity & wait;"] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi - - -# Command to start ray on the head node. You don't need to change this. -# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. -head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 - -# Command to start ray on worker nodes. You don't need to change this. -worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + requests: + cpu: 1000m + memory: 512Mi + limits: + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. 
+ - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. @@ -224,6 +266,16 @@ cluster_synced_files: [] # should sync to the worker node continuously file_mounts_sync_continuously: False +# Patterns for files to exclude when running rsync up or rsync down. +# This is not supported on kubernetes. +# rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +# This is not supported on kubernetes. +# rsync_filter: [] + # List of commands that will be run before `setup_commands`. If docker is # enabled, these commands will run outside the container and before docker @@ -239,6 +291,13 @@ head_setup_commands: [] # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] -head_node: {} +# Command to start ray on the head node. You don't need to change this. +# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 -worker_nodes: {} +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/example-full-legacy.yaml b/python/ray/autoscaler/kubernetes/example-full-legacy.yaml deleted file mode 100644 index 1af270ed4f8a..000000000000 --- a/python/ray/autoscaler/kubernetes/example-full-legacy.yaml +++ /dev/null @@ -1,261 +0,0 @@ -# A unique identifier for the head node and workers of this cluster. -cluster_name: example-cluster - -# The minimum number of workers nodes to launch in addition to the head -# node. This number should be >= 0. -min_workers: 0 - -# The maximum number of workers nodes to launch in addition to the head -# node. This takes precedence over min_workers. -max_workers: 2 - -# The autoscaler will scale up the cluster faster with higher upscaling speed. -# E.g., if the task requires adding more nodes then autoscaler will gradually -# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. -# This number should be > 0. -upscaling_speed: 1.0 - -# If a node is idle for this many minutes, it will be removed. -idle_timeout_minutes: 5 - -# Kubernetes resources that need to be configured for the autoscaler to be -# able to manage the Ray cluster. If any of the provided resources don't -# exist, the autoscaler will attempt to create them. If this fails, you may -# not have the required permissions and will have to request them to be -# created by your cluster administrator. -provider: - type: kubernetes - - # Exposing external IP addresses for ray pods isn't currently supported. - use_internal_ips: true - - # Namespace to use for all resources created. - namespace: ray - - # ServiceAccount created by the autoscaler for the head node pod that it - # runs in. If this field isn't provided, the head pod config below must - # contain a user-created service account with the proper permissions. 
- autoscaler_service_account: - apiVersion: v1 - kind: ServiceAccount - metadata: - name: autoscaler - - # Role created by the autoscaler for the head node pod that it runs in. - # If this field isn't provided, the role referenced in - # autoscaler_role_binding must exist and have at least these permissions. - autoscaler_role: - kind: Role - apiVersion: rbac.authorization.k8s.io/v1 - metadata: - name: autoscaler - rules: - - apiGroups: [""] - resources: ["pods", "pods/status", "pods/exec"] - verbs: ["get", "watch", "list", "create", "delete", "patch"] - - # RoleBinding created by the autoscaler for the head node pod that it runs - # in. If this field isn't provided, the head pod config below must contain - # a user-created service account with the proper permissions. - autoscaler_role_binding: - apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - name: autoscaler - subjects: - - kind: ServiceAccount - name: autoscaler - roleRef: - kind: Role - name: autoscaler - apiGroup: rbac.authorization.k8s.io - - services: - # Service that maps to the head node of the Ray cluster. - - apiVersion: v1 - kind: Service - metadata: - # NOTE: If you're running multiple Ray clusters with services - # on one Kubernetes cluster, they must have unique service - # names. - name: example-cluster-ray-head - spec: - # This selector must match the head node pod's selector below. - selector: - component: example-cluster-ray-head - ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - - -# Kubernetes pod config for the head node pod. -head_node: - apiVersion: v1 - kind: Pod - metadata: - # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - - # Must match the head node service selector above if a head node - # service is required. 
- labels: - component: example-cluster-ray-head - spec: - # Change this if you altered the autoscaler_service_account above - # or want to provide your own. - serviceAccountName: autoscaler - - # Restarting the head node automatically is not currently supported. - # If the head node goes down, `ray up` must be run again. - restartPolicy: Never - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumes: - - name: dshm - emptyDir: - medium: Memory - - containers: - - name: ray-node - imagePullPolicy: Always - # You are free (and encouraged) to use your own container image, - # but it should have the following installed: - # - rsync (used for `ray rsync` commands and file mounts) - # - screen (used for `ray attach`) - # - kubectl (used by the autoscaler to manage worker pods) - image: rayproject/ray:nightly - # Do not change this command - it keeps the pod alive until it is - # explicitly killed. - command: ["/bin/bash", "-c", "--"] - args: ["trap : TERM INT; sleep infinity & wait;"] - ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. 
If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 2Gi - -# Kubernetes pod config for worker node pods. -worker_nodes: - apiVersion: v1 - kind: Pod - metadata: - # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-worker- - - # Must match the worker node service selector above if a worker node - # service is required. - labels: - component: ray-worker - spec: - serviceAccountName: default - - # Worker nodes will be managed automatically by the head node, so - # do not change the restart policy. - restartPolicy: Never - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumes: - - name: dshm - emptyDir: - medium: Memory - - containers: - - name: ray-node - imagePullPolicy: Always - # You are free (and encouraged) to use your own container image, - # but it should have the following installed: - # - rsync (used for `ray rsync` commands and file mounts) - image: rayproject/ray:nightly - # Do not change this command - it keeps the pod alive until it is - # explicitly killed. - command: ["/bin/bash", "-c", "--"] - args: ["trap : TERM INT; sleep infinity & wait;"] - - # This volume allocates shared memory for Ray to use for its plasma - # object store. If you do not provide this, Ray will fall back to - # /tmp which cause slowdowns if is not a shared memory volume. - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # This memory limit will be detected by ray and split into - # 30% for plasma, and 70% for workers. - memory: 2Gi - -# Files or directories to copy to the head and worker nodes. The format is a -# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 
-file_mounts: { -# "~/path1/on/remote/machine": "/path1/on/local/machine", -# "~/path2/on/remote/machine": "/path2/on/local/machine", -} -# Note that the container images in this example have a non-root user. -# To avoid permissions issues, we recommend mounting into a subdirectory of home (~). - -# Files or directories to copy from the head node to the worker nodes. The format is a -# list of paths. The same path on the head node will be copied to the worker node. -# This behavior is a subset of the file_mounts behavior. In the vast majority of cases -# you should just use file_mounts. Only use this if you know what you're doing! -cluster_synced_files: [] - -# Whether changes to directories in file_mounts or cluster_synced_files in the head node -# should sync to the worker node continuously -file_mounts_sync_continuously: False - - -# List of commands that will be run before `setup_commands`. If docker is -# enabled, these commands will run outside the container and before docker -# is setup. -initialization_commands: [] - -# List of shell commands to run to set up nodes. -setup_commands: [] - -# Custom commands that will be run on the head node after common setup. -head_setup_commands: [] - -# Custom commands that will be run on worker nodes after common setup. -worker_setup_commands: [] - -# Command to start ray on the head node. You don't need to change this. -# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. -head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 - -# Command to start ray on worker nodes. You don't need to change this. 
-worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 diff --git a/python/ray/autoscaler/kubernetes/example-full.yaml b/python/ray/autoscaler/kubernetes/example-full.yaml index cb09545d4f09..80ada3b27966 100644 --- a/python/ray/autoscaler/kubernetes/example-full.yaml +++ b/python/ray/autoscaler/kubernetes/example-full.yaml @@ -1,8 +1,12 @@ -# A unique identifier for the head node and workers of this cluster. -cluster_name: example-cluster +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 # The maximum number of workers nodes to launch in addition to the head -# node. +# node. This takes precedence over min_workers. max_workers: 2 # The autoscaler will scale up the cluster faster with higher upscaling speed. @@ -74,86 +78,127 @@ provider: # NOTE: If you're running multiple Ray clusters with services # on one Kubernetes cluster, they must have unique service # names. - name: example-cluster-ray-head + name: ray-head spec: # This selector must match the head node pod's selector below. selector: - component: example-cluster-ray-head + component: ray-head + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + + # Service that maps to the worker nodes of the Ray cluster. + - apiVersion: v1 + kind: Service + metadata: + # NOTE: If you're running multiple Ray clusters with services + # on one Kubernetes cluster, they must have unique service + # names. + name: ray-workers + spec: + # This selector must match the worker node pods' selector below. + selector: + component: ray-worker ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 - -# Specify the pod type for the ray head node (as configured below). 
-head_node_type: head_node -# Specify the allowed pod types for this ray cluster and the resources they provide. -available_node_types: - worker_node: - # Minimum number of Ray workers of this Pod type. - min_workers: 0 - # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers. - max_workers: 2 - # User-specified custom resources for use by Ray. Object with string keys and integer values. - # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.) - resources: {"foo": 1, "bar": 2} - node_config: - apiVersion: v1 - kind: Pod - metadata: + - protocol: TCP + port: 8000 + targetPort: 8000 + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-worker- - spec: + generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. + labels: + component: ray-head + spec: + # Change this if you altered the autoscaler_service_account above + # or want to provide your own. + serviceAccountName: autoscaler + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. 
volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) image: rayproject/ray:nightly + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi - head_node: - node_config: - apiVersion: v1 - kind: Pod - metadata: + requests: + cpu: 1000m + memory: 512Mi + limits: + # The maximum memory that this pod is allowed to use. The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. 
If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: # Automatically generates a name for the pod with this prefix. - generateName: example-cluster-ray-head- - # Must match the head node service selector above if a head node + generateName: ray-worker- + + # Must match the worker node service selector above if a worker node # service is required. labels: - component: example-cluster-ray-head - spec: - # Change this if you altered the autoscaler_service_account above - # or want to provide your own. - serviceAccountName: autoscaler + component: ray-worker + spec: + serviceAccountName: default + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. restartPolicy: Never # This volume allocates shared memory for Ray to use for its plasma @@ -162,48 +207,96 @@ available_node_types: volumes: - name: dshm emptyDir: - medium: Memory + medium: Memory + containers: - name: ray-node imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) image: rayproject/ray:nightly # Do not change this command - it keeps the pod alive until it is # explicitly killed. 
command: ["/bin/bash", "-c", "--"] - args: ['trap : TERM INT; sleep infinity & wait;'] + args: ["trap : TERM INT; sleep infinity & wait;"] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. volumeMounts: - - mountPath: /dev/shm - name: dshm + - mountPath: /dev/shm + name: dshm resources: - requests: - cpu: 1000m - memory: 512Mi - limits: - # The maximum memory that this pod is allowed to use. The - # limit will be detected by ray and split to use 10% for - # redis, 30% for the shared memory object store, and the - # rest for application memory. If this limit is not set and - # the object store size is not set manually, ray will - # allocate a very large object store in each pod that may - # cause problems for other pods. - memory: 512Mi + requests: + cpu: 1000m + memory: 512Mi + limits: + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "~/path1/on/remote/machine": "/path1/on/local/machine", +# "~/path2/on/remote/machine": "/path2/on/local/machine", +} +# Note that the container images in this example have a non-root user. +# To avoid permissions issues, we recommend mounting into a subdirectory of home (~). 
+ +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# Patterns for files to exclude when running rsync up or rsync down. +# This is not supported on kubernetes. +# rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +# This is not supported on kubernetes. +# rsync_filter: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: [] + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] # Command to start ray on the head node. You don't need to change this. -# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. +# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward. 
head_start_ray_commands: - ray stop - - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 + - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 # Command to start ray on worker nodes. You don't need to change this. worker_start_ray_commands: - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/example-minimal.yaml b/python/ray/autoscaler/kubernetes/example-minimal.yaml index dc5b95d0f336..62cf855db8fb 100644 --- a/python/ray/autoscaler/kubernetes/example-minimal.yaml +++ b/python/ray/autoscaler/kubernetes/example-minimal.yaml @@ -1,9 +1,9 @@ # An unique identifier for the head node and workers of this cluster. -cluster_name: example-cluster +cluster_name: minimal # The maximum number of workers nodes to launch in addition to the head -# node. -max_workers: 2 +# node. This takes precedence over min_workers. min_workers default to 0. +max_workers: 1 # Kubernetes resources that need to be configured for the autoscaler to be # able to manage the Ray cluster. If any of the provided resources don't @@ -56,26 +56,3 @@ provider: kind: Role name: autoscaler apiGroup: rbac.authorization.k8s.io - - services: - # Service that maps to the head node of the Ray cluster. - - apiVersion: v1 - kind: Service - metadata: - # NOTE: If you're running multiple Ray clusters with services - # on one Kubernetes cluster, they must have unique service - # names. - name: example-cluster-ray-head - spec: - # This selector must match the head node pod's selector below. 
- selector: - component: example-cluster-ray-head - ports: - - name: client - protocol: TCP - port: 10001 - targetPort: 10001 - - name: dashboard - protocol: TCP - port: 8265 - targetPort: 8265 diff --git a/python/ray/autoscaler/kubernetes/example_scripts/job_example.py b/python/ray/autoscaler/kubernetes/example_scripts/job_example.py deleted file mode 100644 index e58a789ee6ae..000000000000 --- a/python/ray/autoscaler/kubernetes/example_scripts/job_example.py +++ /dev/null @@ -1,71 +0,0 @@ -from collections import Counter -import os -import sys -import time -import ray - -""" This script is meant to be run from a pod in the same Kubernetes namespace -as your Ray cluster. - -Just below are the environment variables used to access Ray client via a -service targetting the Ray cluster's head node pod. -These environment variables are set by Kubernetes. -See https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables -In the documentation examples, the head service has -"example-cluster-ray-head" and the relevant port is named "client". -Modify the environment variables as needed to match the name of the service -and port. - -Note: The default head service set up by the Ray Kubernetes operator is named --ray-head, -where is the metadata.name field you set in the RayCluster -custom resource. -""" # noqa -HEAD_SERVICE_IP_ENV = "EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_HOST" -HEAD_SERVICE_CLIENT_PORT_ENV = "EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_PORT_CLIENT" - - -@ray.remote -def gethostname(x): - import platform - import time - time.sleep(0.01) - return x + (platform.node(), ) - - -def wait_for_nodes(expected): - # Wait for all nodes to join the cluster. 
- while True: - resources = ray.cluster_resources() - node_keys = [key for key in resources if "node" in key] - num_nodes = sum(resources[node_key] for node_key in node_keys) - if num_nodes < expected: - print("{} nodes have joined so far, waiting for {} more.".format( - num_nodes, expected - num_nodes)) - sys.stdout.flush() - time.sleep(1) - else: - break - - -def main(): - wait_for_nodes(3) - - # Check that objects can be transferred from each node to each other node. - for i in range(10): - print("Iteration {}".format(i)) - results = [ - gethostname.remote(gethostname.remote(())) for _ in range(100) - ] - print(Counter(ray.get(results))) - sys.stdout.flush() - - print("Success!") - sys.stdout.flush() - - -if __name__ == "__main__": - head_service_ip = os.environ[HEAD_SERVICE_IP_ENV] - client_port = os.environ[HEAD_SERVICE_CLIENT_PORT_ENV] - ray.util.connect(f"{head_service_ip}:{client_port}") - main() diff --git a/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py b/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py deleted file mode 100644 index 667f8c628960..000000000000 --- a/python/ray/autoscaler/kubernetes/example_scripts/run_local_example.py +++ /dev/null @@ -1,58 +0,0 @@ -from collections import Counter -import sys -import time -import ray -""" Run this script locally to execute a Ray program on your Ray cluster on -Kubernetes. - -Before running this script, you must port-forward from the local host to -the relevant Kubernetes head service e.g. -kubectl -n ray port-forward service/example-cluster-ray-head 10001:10001. - -Set the constant LOCAL_PORT below to the local port being forwarded. -""" -LOCAL_PORT = 10001 - - -@ray.remote -def gethostname(x): - import platform - import time - time.sleep(0.01) - return x + (platform.node(), ) - - -def wait_for_nodes(expected): - # Wait for all nodes to join the cluster. 
- while True: - resources = ray.cluster_resources() - node_keys = [key for key in resources if "node" in key] - num_nodes = sum(resources[node_key] for node_key in node_keys) - if num_nodes < expected: - print("{} nodes have joined so far, waiting for {} more.".format( - num_nodes, expected - num_nodes)) - sys.stdout.flush() - time.sleep(1) - else: - break - - -def main(): - wait_for_nodes(3) - - # Check that objects can be transferred from each node to each other node. - for i in range(10): - print("Iteration {}".format(i)) - results = [ - gethostname.remote(gethostname.remote(())) for _ in range(100) - ] - print(Counter(ray.get(results))) - sys.stdout.flush() - - print("Success!") - sys.stdout.flush() - - -if __name__ == "__main__": - ray.util.connect(f"127.0.0.1:{LOCAL_PORT}") - main() diff --git a/python/ray/autoscaler/kubernetes/job-example.yaml b/python/ray/autoscaler/kubernetes/job-example.yaml deleted file mode 100644 index b5e140dc8036..000000000000 --- a/python/ray/autoscaler/kubernetes/job-example.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Job to run a Ray program in its own pod. Assumes that a Ray cluster is already -# running. 
-apiVersion: batch/v1 -kind: Job -metadata: - generateName: ray-test-job- -spec: - template: - spec: - restartPolicy: Never - containers: - - name: ray - image: rayproject/ray:nightly - imagePullPolicy: Always - command: ["python"] - args: - - "$(EXAMPLE_PROGRAM_PATH)" - env: - - name: EXAMPLE_PROGRAM_PATH - value: "/home/ray/anaconda3/lib/python3.7/site-packages/ray/autoscaler/kubernetes/example_scripts/job_example.py" - resources: - requests: - cpu: 100m - memory: 512Mi diff --git a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml index df7a33254cf5..9e92d5d4f6bc 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml @@ -13,16 +13,6 @@ spec: - name: v1 served: true storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: status - type: string - description: Running or Error - jsonPath: .status.phase - - name: age - type: date - jsonPath: .metadata.creationTimestamp schema: openAPIV3Schema: description: Ray cluster configuration @@ -30,17 +20,12 @@ spec: required: - spec properties: - status: - type: object - properties: - phase: - description: Running or Error - type: string spec: type: object required: - podTypes - headPodType + - workerDefaultPodType properties: maxWorkers: description: The maximum number of workers nodes to launch in addition to the @@ -78,9 +63,9 @@ spec: description: Maximum number of Ray workers of this Pod type. rayResources: type: object - description: User-specified custom resources for use by Ray. Keys strings, values integers. - # TODO (dmitri): Validate that values are integers [patternProperties not supported by OpenAPI v3.0] - x-kubernetes-preserve-unknown-fields: true + description: User-specified custom resources for use by Ray. 
+ # TODO (dmitri): Validate that values are numeric [patternProperties not supported by OpenAPI v3.0] + x-kubernetes-preserve-unknown-fields: true setupCommands: description: Commands to run before starting the Ray runtime. type: array @@ -4279,6 +4264,9 @@ spec: headPodType: description: Specifies the head node type. type: string + workerDefaultPodType: + description: Specifies the default worker node type. + type: string headStartRayCommands: description: Commands to start Ray on the head node. type: array diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml index 34018f0c47d0..bb4a71fcc203 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml @@ -14,6 +14,8 @@ spec: idleTimeoutMinutes: 5 # Specify the pod type for the ray head node (as configured below). headPodType: head-node + # Specify the default pod type for ray the worker nodes (as configured below). + workerDefaultPodType: worker-nodes # Specify the allowed pod types for this ray cluster and the resources they provide. podTypes: - name: head-node @@ -42,9 +44,9 @@ spec: command: ["/bin/bash", "-c", "--"] args: ['trap : TERM INT; sleep infinity & wait;'] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 6379 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to @@ -65,14 +67,16 @@ spec: # allocate a very large object store in each pod that may # cause problems for other pods. memory: 512Mi - - name: worker-node + - name: worker-nodes # Minimum number of Ray workers of this Pod type. 
minWorkers: 2 # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers. maxWorkers: 3 - # User-specified custom resources for use by Ray. - # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.) - rayResources: {"foo": 1, "bar": 1} + # User-specified custom resources for use by Ray + rayResources: {"Custom1": 1, "is_spot": 1} + # Optional commands to run before starting the Ray runtime. + setupCommands: + - pip install numpy # Example podConfig: apiVersion: v1 kind: Pod @@ -91,6 +95,9 @@ spec: image: rayproject/ray:nightly command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. @@ -113,9 +120,9 @@ spec: # Commands to start Ray on the head node. You don't need to change this. # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward. headStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0 + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 # Commands to start Ray on worker nodes. You don't need to change this. 
workerStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml index c244a589faac..e5e4ecf3197a 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml @@ -14,6 +14,8 @@ spec: idleTimeoutMinutes: 5 # Specify the pod type for the ray head node (as configured below). headPodType: head-node + # Specify the default pod type for ray the worker nodes (as configured below). + workerDefaultPodType: worker-nodes # Specify the allowed pod types for this ray cluster and the resources they provide. podTypes: - name: head-node @@ -42,9 +44,9 @@ spec: command: ["/bin/bash", "-c", "--"] args: ['trap : TERM INT; sleep infinity & wait;'] ports: - - containerPort: 6379 # Redis port - - containerPort: 10001 # Used by Ray Client - - containerPort: 8265 # Used by Ray Dashboard + - containerPort: 6379 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to @@ -65,14 +67,16 @@ spec: # allocate a very large object store in each pod that may # cause problems for other pods. memory: 512Mi - - name: worker-node + - name: worker-nodes # Minimum number of Ray workers of this Pod type. minWorkers: 1 # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers. maxWorkers: 3 - # User-specified custom resources for use by Ray. Object with string keys and integer values. - # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.) 
- rayResources: {"baz": 5, "quux": 17} + # User-specified custom resources for use by Ray + rayResources: {"Custom1": 1, "is_spot": 1} + # Optional commands to run before starting the Ray runtime. + setupCommands: + - pip install numpy # Example podConfig: apiVersion: v1 kind: Pod @@ -91,6 +95,9 @@ spec: image: rayproject/ray:nightly command: ["/bin/bash", "-c", "--"] args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to # /tmp which cause slowdowns if is not a shared memory volume. @@ -113,9 +120,9 @@ spec: # Commands to start Ray on the head node. You don't need to change this. # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward. headStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0 + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0 # Commands to start Ray on worker nodes. You don't need to change this. 
workerStartRayCommands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml b/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml index f0f43a1efdc9..2c170f072df8 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/operator.yaml @@ -10,7 +10,7 @@ metadata: name: ray-operator-role rules: - apiGroups: ["", "cluster.ray.io"] - resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec", "services"] + resources: ["rayclusters", "rayclusters/finalizers", "pods", "pods/exec"] verbs: ["get", "watch", "list", "create", "delete", "patch", "update"] --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index a5d927a01178..41a4a070832e 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -24,7 +24,7 @@ "type": "string" }, "min_workers": { - "description": "DEPRECATED. Use the per node_type min_workers field instead.", + "description": "The minimum number of workers nodes to launch in addition to the head node. This number should be >= 0", "type": "integer", "minimum": 0 }, @@ -34,17 +34,17 @@ "minimum": 0 }, "initial_workers": { - "description": "DEPRECATED.", + "description": "The number of workers to launch initially, in addition to the head node.", "type": "integer", "minimum": 0 }, "autoscaling_mode": { - "description": "DEPRECATED. Use upscaling_speed instead.", + "description": "The mode of the autoscaler e.g. default, aggressive", "type": "string", "enum": [ "default", "aggressive" ] }, "target_utilization_fraction": { - "description": "DEPRECATED. 
Use upscaling_speed instead.", + "description": "The autoscaler will scale up the cluster to this target fraction of resources usage. For example, if a cluster of 8 nodes is 100% busy # and target_utilization was 0.8, it would resize the cluster to 10.", "type": "number", "minimum": 0, "maximum": 1 @@ -247,11 +247,6 @@ "type": "boolean", "description": "disable Ray from automatically detecting /dev/shm size for the container", "default": false - }, - "use_podman" : { - "type": "boolean", - "description": "Use 'podman' command in place of 'docker'", - "default": false } } }, @@ -261,7 +256,7 @@ }, "worker_default_node_type": { "type": "string", - "description": "DEPRECATED." + "description": "If using multiple node types, specifies the default worker node type." }, "head_node": { "type": "object", @@ -337,12 +332,8 @@ "min_workers": {"type": "integer"}, "max_workers": {"type": "integer"}, "resources": { - "patternProperties": { - ".*":{ - "type": "integer", - "minimum": 0 - } - } + "type": "object", + ".*": {"type": "number"} }, "initialization_commands": { "$ref": "#/definitions/commands", diff --git a/python/ray/autoscaler/staroid/example-multi-node-type.yaml b/python/ray/autoscaler/staroid/example-multi-node-type.yaml index f0291963ec3c..860bb6a87674 100644 --- a/python/ray/autoscaler/staroid/example-multi-node-type.yaml +++ b/python/ray/autoscaler/staroid/example-multi-node-type.yaml @@ -1,5 +1,6 @@ # an example of configuring a mixed-node-type cluster. cluster_name: multi-node-type # name with 'a-z' and '-' +min_workers: 1 max_workers: 40 # The autoscaler will scale up the cluster faster with higher upscaling speed. @@ -102,6 +103,9 @@ available_node_types: # Specify the node type of the head node (as configured above). head_node_type: cpu_4_ondemand +# Specify the default type of the worker node (as configured above). +worker_default_node_type: cpu_4_spot + # The default settings for the head node. 
This will be merged with the per-node # type configs given above. #head_node: diff --git a/python/ray/experimental/shuffle.py b/python/ray/experimental/shuffle.py deleted file mode 100644 index 0a3f0165609f..000000000000 --- a/python/ray/experimental/shuffle.py +++ /dev/null @@ -1,215 +0,0 @@ -"""A simple distributed shuffle implementation in Ray. - -This utility provides a `simple_shuffle` function that can be used to -redistribute M input partitions into N output partitions. It does this with -a single wave of shuffle map tasks followed by a single wave of shuffle reduce -tasks. Each shuffle map task generates O(N) output objects, and each shuffle -reduce task consumes O(M) input objects, for a total of O(N*M) objects. - -To try an example 10GB shuffle, run: - - $ python -m ray.experimental.shuffle \ - --num-partitions=50 --partition-size=200e6 \ - --object-store-memory=1e9 - -This will print out some statistics on the shuffle execution such as: - - --- Aggregate object store stats across all nodes --- - Plasma memory usage 0 MiB, 0 objects, 0.0% full - Spilled 9487 MiB, 2487 objects, avg write throughput 1023 MiB/s - Restored 9487 MiB, 2487 objects, avg read throughput 1358 MiB/s - Objects consumed by Ray tasks: 9537 MiB. - - Shuffled 9536 MiB in 16.579771757125854 seconds -""" - -from typing import List, Iterable, Tuple, Callable, Any - -import ray -from ray import ObjectRef - -# TODO(ekl) why doesn't TypeVar() deserialize properly in Ray? -# The type produced by the input reader function. -InType = Any -# The type produced by the output writer function. -OutType = Any -# Integer identifying the partition number. -PartitionID = int - - -class ObjectStoreWriter: - """This class is used to stream shuffle map outputs to the object store. - - It can be subclassed to optimize writing (e.g., batching together small - records into larger objects). 
This will be performance critical if your - input records are small (the example shuffle uses very large records, so - the naive strategy works well). - """ - - def __init__(self): - self.results = [] - - def add(self, item: InType) -> None: - """Queue a single item to be written to the object store. - - This base implementation immediately writes each given item to the - object store as a standalone object. - """ - self.results.append(ray.put(item)) - - def finish(self) -> List[ObjectRef]: - """Return list of object refs representing written items.""" - return self.results - - -def round_robin_partitioner(input_stream: Iterable[InType], num_partitions: int - ) -> Iterable[Tuple[PartitionID, InType]]: - """Round robin partitions items from the input reader. - - You can write custom partitioning functions for your use case. - - Args: - input_stream: Iterator over items from the input reader. - num_partitions: Number of output partitions. - - Yields: - Tuples of (partition id, input item). - """ - i = 0 - for item in input_stream: - yield (i, item) - i += 1 - i %= num_partitions - - -def simple_shuffle( - *, - input_reader: Callable[[PartitionID], Iterable[InType]], - input_num_partitions: int, - output_num_partitions: int, - output_writer: Callable[[PartitionID, List[ObjectRef]], OutType], - partitioner: Callable[[Iterable[InType], int], Iterable[ - PartitionID]] = round_robin_partitioner, - object_store_writer: ObjectStoreWriter = ObjectStoreWriter, -) -> List[OutType]: - """Simple distributed shuffle in Ray. - - Args: - input_reader: Function that generates the input items for a - partition (e.g., data records). - input_num_partitions: The number of input partitions. - output_num_partitions: The desired number of output partitions. - output_writer: Function that consumes a iterator of items for a - given output partition. It returns a single value that will be - collected across all output partitions. - partitioner: Partitioning function to use. 
Defaults to round-robin - partitioning of input items. - object_store_writer: Class used to write input items to the - object store in an efficient way. Defaults to a naive - implementation that writes each input record as one object. - - Returns: - List of outputs from the output writers. - """ - - @ray.remote(num_returns=output_num_partitions) - def shuffle_map(i: PartitionID) -> List[List[ObjectRef]]: - writers = [object_store_writer() for _ in range(output_num_partitions)] - for out_i, item in partitioner(input_reader(i), output_num_partitions): - writers[out_i].add(item) - return [c.finish() for c in writers] - - @ray.remote - def shuffle_reduce(i: PartitionID, - *mapper_outputs: List[List[ObjectRef]]) -> OutType: - input_objects = [] - assert len(mapper_outputs) == input_num_partitions - for obj_refs in mapper_outputs: - for obj_ref in obj_refs: - input_objects.append(obj_ref) - return output_writer(i, input_objects) - - shuffle_map_out = [ - shuffle_map.remote(i) for i in range(input_num_partitions) - ] - - shuffle_reduce_out = [ - shuffle_reduce.remote( - j, *[shuffle_map_out[i][j] for i in range(input_num_partitions)]) - for j in range(output_num_partitions) - ] - - return ray.get(shuffle_reduce_out) - - -@ray.remote -class _StatusTracker: - def __init__(self): - self.num_map = 0 - self.num_reduce = 0 - - def inc(self): - self.num_map += 1 - print("Num map tasks finished", self.num_map) - - def inc2(self): - self.num_reduce += 1 - print("Num reduce tasks finished", self.num_reduce) - - -def main(): - import argparse - import numpy as np - import time - - parser = argparse.ArgumentParser() - parser.add_argument("--ray-address", type=str, default=None) - parser.add_argument("--object-store-memory", type=float, default=1e9) - parser.add_argument("--num-partitions", type=int, default=5) - parser.add_argument("--partition-size", type=float, default=200e6) - args = parser.parse_args() - - if args.ray_address: - ray.init(address=args.ray_address) - else: - 
ray.init(object_store_memory=args.object_store_memory) - - partition_size = int(args.partition_size) - num_partitions = args.num_partitions - rows_per_partition = partition_size // (8 * 2) - tracker = _StatusTracker.remote() - - def input_reader(i: PartitionID) -> Iterable[InType]: - for _ in range(num_partitions): - yield np.ones( - (rows_per_partition // num_partitions, 2), dtype=np.int64) - tracker.inc.remote() - - def output_writer(i: PartitionID, - shuffle_inputs: List[ObjectRef]) -> OutType: - total = 0 - # TODO(ekl) using ray.wait can be more efficient for pipelining. - for obj_ref in shuffle_inputs: - arr = ray.get(obj_ref) - total += arr.size * arr.itemsize - tracker.inc2.remote() - return total - - start = time.time() - output_sizes = simple_shuffle( - input_reader=input_reader, - input_num_partitions=num_partitions, - output_num_partitions=num_partitions, - output_writer=output_writer) - delta = time.time() - start - - time.sleep(.5) - print() - print(ray.internal.internal_api.memory_summary(stats_only=True)) - print() - print("Shuffled", int(sum(output_sizes) / (1024 * 1024)), "MiB in", delta, - "seconds") - - -if __name__ == "__main__": - main() diff --git a/python/ray/external_storage.py b/python/ray/external_storage.py index 138561f432e2..1b4f6fec81f1 100644 --- a/python/ray/external_storage.py +++ b/python/ray/external_storage.py @@ -1,7 +1,5 @@ import abc -import logging import os -import shutil import urllib from collections import namedtuple from typing import List, IO, Tuple @@ -11,7 +9,6 @@ from ray._raylet import ObjectRef ParsedURL = namedtuple("ParsedURL", "base_url, offset, size") -logger = logging.getLogger(__name__) def create_url_with_offset(*, url: str, offset: int, size: int) -> str: @@ -80,32 +77,27 @@ class ExternalStorage(metaclass=abc.ABCMeta): the external storage is invalid. 
""" - HEADER_LENGTH = 24 - def _get_objects_from_store(self, object_refs): worker = ray.worker.global_worker - # Since the object should always exist in the plasma store before - # spilling, it can directly get the object from the local plasma - # store. - # issue: https://github.com/ray-project/ray/pull/13831 - ray_object_pairs = worker.core_worker.get_if_local(object_refs) + ray_object_pairs = worker.core_worker.get_objects( + object_refs, + worker.current_task_id, + timeout_ms=0, + plasma_objects_only=True) return ray_object_pairs - def _put_object_to_store(self, metadata, data_size, file_like, object_ref, - owner_address): + def _put_object_to_store(self, metadata, data_size, file_like, object_ref): worker = ray.worker.global_worker worker.core_worker.put_file_like_object(metadata, data_size, file_like, - object_ref, owner_address) + object_ref) def _write_multiple_objects(self, f: IO, object_refs: List[ObjectRef], - owner_addresses: List[str], url: str) -> List[str]: """Fuse all given objects into a given file handle. Args: f(IO): File handle to fusion all given object refs. object_refs(list): Object references to fusion to a single file. - owner_addresses(list): Owner addresses for the provided objects. url(str): url where the object ref is stored in the external storage. @@ -117,18 +109,13 @@ def _write_multiple_objects(self, f: IO, object_refs: List[ObjectRef], keys = [] offset = 0 ray_object_pairs = self._get_objects_from_store(object_refs) - for ref, (buf, metadata), owner_address in zip( - object_refs, ray_object_pairs, owner_addresses): - address_len = len(owner_address) + for ref, (buf, metadata) in zip(object_refs, ray_object_pairs): metadata_len = len(metadata) buf_len = len(buf) - # 24 bytes to store owner address, metadata, and buffer lengths. - data_size_in_bytes = ( - address_len + metadata_len + buf_len + self.HEADER_LENGTH) - f.write(address_len.to_bytes(8, byteorder="little")) + # 16 bytes to store metadata and buffer length. 
+ data_size_in_bytes = metadata_len + buf_len + 16 f.write(metadata_len.to_bytes(8, byteorder="little")) f.write(buf_len.to_bytes(8, byteorder="little")) - f.write(owner_address) f.write(metadata) f.write(memoryview(buf)) url_with_offset = create_url_with_offset( @@ -137,8 +124,7 @@ def _write_multiple_objects(self, f: IO, object_refs: List[ObjectRef], offset += data_size_in_bytes return keys - def _size_check(self, address_len, metadata_len, buffer_len, - obtained_data_size): + def _size_check(self, metadata_len, buffer_len, obtained_data_size): """Check whether or not the obtained_data_size is as expected. Args: @@ -149,11 +135,9 @@ def _size_check(self, address_len, metadata_len, buffer_len, Raises: ValueError if obtained_data_size is different from - address_len + metadata_len + buffer_len + - 24 (first 8 bytes to store length). + metadata_len + buffer_len + 16(first 8 bytes to store length). """ - data_size_in_bytes = ( - address_len + metadata_len + buffer_len + self.HEADER_LENGTH) + data_size_in_bytes = metadata_len + buffer_len + 16 if data_size_in_bytes != obtained_data_size: raise ValueError( f"Obtained data has a size of {data_size_in_bytes}, " @@ -161,7 +145,7 @@ def _size_check(self, address_len, metadata_len, buffer_len, f"size of {obtained_data_size}.") @abc.abstractmethod - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: """Spill objects to the external storage. Objects are specified by their object refs. @@ -192,19 +176,11 @@ def delete_spilled_objects(self, urls: List[str]): urls: URLs that store spilled object files. """ - @abc.abstractmethod - def destroy_external_storage(self): - """Destroy external storage when a head node is down. 
- - NOTE: This is currently working when the cluster is - started by ray.init - """ - class NullStorage(ExternalStorage): """The class that represents an uninitialized external storage.""" - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: raise NotImplementedError("External storage is not initialized") def restore_spilled_objects(self, object_refs, url_with_offset_list): @@ -213,9 +189,6 @@ def restore_spilled_objects(self, object_refs, url_with_offset_list): def delete_spilled_objects(self, urls: List[str]): raise NotImplementedError("External storage is not initialized") - def destroy_external_storage(self): - raise NotImplementedError("External storage is not initialized") - class FileSystemStorage(ExternalStorage): """The class for filesystem-like external storage. @@ -226,23 +199,22 @@ class FileSystemStorage(ExternalStorage): """ def __init__(self, directory_path): - self.spill_dir_name = DEFAULT_OBJECT_PREFIX - self.directory_path = os.path.join(directory_path, self.spill_dir_name) + self.directory_path = directory_path + self.prefix = DEFAULT_OBJECT_PREFIX os.makedirs(self.directory_path, exist_ok=True) if not os.path.exists(self.directory_path): raise ValueError("The given directory path to store objects, " f"{self.directory_path}, could not be created.") - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: if len(object_refs) == 0: return [] # Always use the first object ref as a key when fusioning objects. 
first_ref = object_refs[0] - filename = f"{first_ref.hex()}-multi-{len(object_refs)}" + filename = f"{self.prefix}-{first_ref.hex()}-multi-{len(object_refs)}" url = f"{os.path.join(self.directory_path, filename)}" with open(url, "wb") as f: - return self._write_multiple_objects(f, object_refs, - owner_addresses, url) + return self._write_multiple_objects(f, object_refs, url) def restore_spilled_objects(self, object_refs: List[ObjectRef], url_with_offset_list: List[str]): @@ -257,17 +229,13 @@ def restore_spilled_objects(self, object_refs: List[ObjectRef], # Read a part of the file and recover the object. with open(base_url, "rb") as f: f.seek(offset) - address_len = int.from_bytes(f.read(8), byteorder="little") metadata_len = int.from_bytes(f.read(8), byteorder="little") buf_len = int.from_bytes(f.read(8), byteorder="little") - self._size_check(address_len, metadata_len, buf_len, - parsed_result.size) + self._size_check(metadata_len, buf_len, parsed_result.size) total += buf_len - owner_address = f.read(address_len) metadata = f.read(metadata_len) # read remaining data to our buffer - self._put_object_to_store(metadata, buf_len, f, object_ref, - owner_address) + self._put_object_to_store(metadata, buf_len, f, object_ref) return total def delete_spilled_objects(self, urls: List[str]): @@ -275,25 +243,6 @@ def delete_spilled_objects(self, urls: List[str]): filename = parse_url_with_offset(url.decode()).base_url os.remove(os.path.join(self.directory_path, filename)) - def destroy_external_storage(self): - # Q: Should we add stdout here to - # indicate we are deleting a directory? - - # There's a race condition where IO workers are still - # deleting each objects while we try deleting the - # whole directory. So we should keep trying it until - # The directory is actually deleted. 
- while os.path.isdir(self.directory_path): - try: - shutil.rmtree(self.directory_path) - except FileNotFoundError: - # If excpetion occurs when other IO workers are - # deleting the file at the same time. - pass - except Exception: - logger.exception("Error cleaning up spill files") - break - class ExternalStorageSmartOpenImpl(ExternalStorage): """The external storage class implemented by smart_open. @@ -338,7 +287,7 @@ def __init__(self, self.transport_params = {"defer_seek": True} self.transport_params.update(self.override_transport_params) - def spill_objects(self, object_refs, owner_addresses) -> List[str]: + def spill_objects(self, object_refs) -> List[str]: if len(object_refs) == 0: return [] from smart_open import open @@ -349,8 +298,7 @@ def spill_objects(self, object_refs, owner_addresses) -> List[str]: with open( url, "wb", transport_params=self.transport_params) as file_like: - return self._write_multiple_objects(file_like, object_refs, - owner_addresses, url) + return self._write_multiple_objects(file_like, object_refs, url) def restore_spilled_objects(self, object_refs: List[ObjectRef], url_with_offset_list: List[str]): @@ -371,24 +319,18 @@ def restore_spilled_objects(self, object_refs: List[ObjectRef], # smart open seek reads the file from offset-end_of_the_file # when the seek is called. 
f.seek(offset) - address_len = int.from_bytes(f.read(8), byteorder="little") metadata_len = int.from_bytes(f.read(8), byteorder="little") buf_len = int.from_bytes(f.read(8), byteorder="little") self._size_check(metadata_len, buf_len, parsed_result.size) - owner_address = f.read(address_len) total += buf_len metadata = f.read(metadata_len) # read remaining data to our buffer - self._put_object_to_store(metadata, buf_len, f, object_ref, - owner_address) + self._put_object_to_store(metadata, buf_len, f, object_ref) return total def delete_spilled_objects(self, urls: List[str]): pass - def destroy_external_storage(self): - pass - _external_storage = NullStorage() @@ -403,15 +345,10 @@ def setup_external_storage(config): elif storage_type == "smart_open": _external_storage = ExternalStorageSmartOpenImpl( **config["params"]) - elif storage_type == "mock_distributed_fs": - # This storage is used to unit test distributed external storages. - # TODO(sang): Delete it after introducing the mock S3 test. - _external_storage = FileSystemStorage(**config["params"]) else: raise ValueError(f"Unknown external storage type: {storage_type}") else: _external_storage = NullStorage() - return _external_storage def reset_external_storage(): @@ -419,17 +356,16 @@ def reset_external_storage(): _external_storage = NullStorage() -def spill_objects(object_refs, owner_addresses): +def spill_objects(object_refs): """Spill objects to the external storage. Objects are specified by their object refs. Args: object_refs: The list of the refs of the objects to be spilled. - owner_addresses: The owner addresses of the provided object refs. Returns: A list of keys corresponding to the input object refs. 
""" - return _external_storage.spill_objects(object_refs, owner_addresses) + return _external_storage.spill_objects(object_refs) def restore_spilled_objects(object_refs: List[ObjectRef], diff --git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index 679ff6f0aa3b..a7ba4b23b8b2 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -270,8 +270,7 @@ cdef extern from "ray/core_worker/common.h" nogil: CPlacementGroupCreationOptions( const c_string &name, CPlacementStrategy strategy, - const c_vector[unordered_map[c_string, double]] &bundles, - c_bool is_detached + const c_vector[unordered_map[c_string, double]] &bundles ) cdef extern from "ray/gcs/gcs_client.h" nogil: diff --git a/python/ray/includes/global_state_accessor.pxd b/python/ray/includes/global_state_accessor.pxd index e27aa0547d2a..31418f10c0af 100644 --- a/python/ray/includes/global_state_accessor.pxd +++ b/python/ray/includes/global_state_accessor.pxd @@ -32,6 +32,4 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil: c_bool AddWorkerInfo(const c_string &serialized_string) unique_ptr[c_string] GetPlacementGroupInfo( const CPlacementGroupID &placement_group_id) - unique_ptr[c_string] GetPlacementGroupByName( - const c_string &placement_group_name) c_vector[c_string] GetAllPlacementGroupInfo() diff --git a/python/ray/includes/global_state_accessor.pxi b/python/ray/includes/global_state_accessor.pxi index 5690d3bab65e..cbb1bac0aed9 100644 --- a/python/ray/includes/global_state_accessor.pxi +++ b/python/ray/includes/global_state_accessor.pxi @@ -147,13 +147,3 @@ cdef class GlobalStateAccessor: if result: return c_string(result.get().data(), result.get().size()) return None - - def get_placement_group_by_name(self, placement_group_name): - cdef unique_ptr[c_string] result - cdef c_string cplacement_group_name = placement_group_name - with nogil: - result = self.inner.get().GetPlacementGroupByName( - cplacement_group_name) - if result: - return 
c_string(result.get().data(), result.get().size()) - return None diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 2eb5f109bf65..f1acad1fadd8 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -183,9 +183,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results, c_bool plasma_objects_only) - CRayStatus GetIfLocal( - const c_vector[CObjectID] &ids, - c_vector[shared_ptr[CRayObject]] *results) CRayStatus Contains(const CObjectID &object_id, c_bool *has_object) CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects, int64_t timeout_ms, c_vector[c_bool] *results, @@ -241,16 +238,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: (void(const CWorkerID &) nogil) on_worker_shutdown (CRayStatus() nogil) check_signals (void() nogil) gc_collect - (c_vector[c_string]( - const c_vector[CObjectID] &, - const c_vector[c_string] &) nogil) spill_objects + (c_vector[c_string](const c_vector[CObjectID] &) nogil) spill_objects (int64_t( const c_vector[CObjectID] &, const c_vector[c_string] &) nogil) restore_spilled_objects (void( const c_vector[c_string]&, CWorkerType) nogil) delete_spilled_objects - (void(const CRayObject&) nogil) unhandled_exception_handler (void(c_string *stack_out) nogil) get_lang_stack c_bool ref_counting_enabled c_bool is_local_mode diff --git a/python/ray/includes/object_ref.pxi b/python/ray/includes/object_ref.pxi index 31c59d08ba2c..3353e696edbf 100644 --- a/python/ray/includes/object_ref.pxi +++ b/python/ray/includes/object_ref.pxi @@ -1,7 +1,6 @@ from ray.includes.unique_ids cimport CObjectID import asyncio -from typing import Callable, Any import ray @@ -72,41 +71,10 @@ cdef class ObjectRef(BaseID): def as_future(self): loop = asyncio.get_event_loop() - py_future = loop.create_future() - - def callback(result): - loop = 
py_future._loop - - def set_future(): - # Issue #11030, #8841 - # If this future has result set already, we just need to - # skip the set result/exception procedure. - if py_future.done(): - return - - if isinstance(result, RayTaskError): - ray.worker.last_task_error_raise_time = time.time() - py_future.set_exception(result.as_instanceof_cause()) - elif isinstance(result, RayError): - # Directly raise exception for RayActorError - py_future.set_exception(result) - else: - py_future.set_result(result) - - loop.call_soon_threadsafe(set_future) - - self._on_completed(callback) + core_worker = ray.worker.global_worker.core_worker + future = loop.create_future() + core_worker.get_async(self, future) # A hack to keep a reference to the object ref for ref counting. - py_future.object_ref = self - return py_future - - def _on_completed(self, py_callback: Callable[[Any], None]): - """Register a callback that will be called after Object is ready. - If the ObjectRef is already ready, the callback will be called soon. - The callback should take the result as the only argument. The result - can be an exception object in case of task error. 
- """ - core_worker = ray.worker.global_worker.core_worker - core_worker.set_get_async_callback(self, py_callback) - return self + future.object_ref = self + return future diff --git a/python/ray/includes/ray_config.pxd b/python/ray/includes/ray_config.pxd index 309132cf74c6..079f30690998 100644 --- a/python/ray/includes/ray_config.pxd +++ b/python/ray/includes/ray_config.pxd @@ -13,7 +13,7 @@ cdef extern from "ray/common/ray_config.h" nogil: int64_t handler_warning_timeout_ms() const - int64_t raylet_heartbeat_period_milliseconds() const + int64_t raylet_heartbeat_timeout_milliseconds() const int64_t debug_dump_period_milliseconds() const diff --git a/python/ray/includes/ray_config.pxi b/python/ray/includes/ray_config.pxi index d6c28805c48c..96a2a14f24d8 100644 --- a/python/ray/includes/ray_config.pxi +++ b/python/ray/includes/ray_config.pxi @@ -10,8 +10,8 @@ cdef class Config: return RayConfig.instance().handler_warning_timeout_ms() @staticmethod - def raylet_heartbeat_period_milliseconds(): - return RayConfig.instance().raylet_heartbeat_period_milliseconds() + def raylet_heartbeat_timeout_milliseconds(): + return RayConfig.instance().raylet_heartbeat_timeout_milliseconds() @staticmethod def debug_dump_period_milliseconds(): diff --git a/python/ray/internal/internal_api.py b/python/ray/internal/internal_api.py index 7956725b7b05..67c1a9275f37 100644 --- a/python/ray/internal/internal_api.py +++ b/python/ray/internal/internal_api.py @@ -13,9 +13,7 @@ def global_gc(): worker.core_worker.global_gc() -def memory_summary(node_manager_address=None, - node_manager_port=None, - stats_only=False): +def memory_summary(node_manager_address=None, node_manager_port=None): """Returns a formatted string describing memory usage in the cluster.""" import grpc @@ -65,11 +63,6 @@ def memory_summary(node_manager_address=None, reply.store_stats.restored_objects_total, int(reply.store_stats.restored_bytes_total / (1024 * 1024) / reply.store_stats.restore_time_total_s))) - if 
reply.store_stats.consumed_bytes > 0: - store_summary += ("Objects consumed by Ray tasks: {} MiB.".format( - int(reply.store_stats.consumed_bytes / (1024 * 1024)))) - if stats_only: - return store_summary return reply.memory_summary + "\n" + store_summary diff --git a/python/ray/memory_monitor.py b/python/ray/memory_monitor.py index 448678d0283f..9381c506459e 100644 --- a/python/ray/memory_monitor.py +++ b/python/ray/memory_monitor.py @@ -54,9 +54,7 @@ def get_message(used_gb, total_gb, threshold): round(get_shared(psutil.virtual_memory()) / (1024**3), 2)) + "currently being used by the Ray object store.\n---\n" "--- Tip: Use the `ray memory` command to list active " - "objects in the cluster.\n" - "--- To disable OOM exceptions, set " - "RAY_DISABLE_MEMORY_MONITOR=1.\n---\n") + "objects in the cluster.\n---\n") class MemoryMonitor: @@ -122,9 +120,8 @@ def get_memory_usage(self): def raise_if_low_memory(self): if time.time() - self.last_checked > self.check_interval: - if ("RAY_DEBUG_DISABLE_MEMORY_MONITOR" in os.environ - or "RAY_DISABLE_MEMORY_MONITOR" in os.environ): - return + if "RAY_DEBUG_DISABLE_MEMORY_MONITOR" in os.environ: + return # escape hatch, not intended for user use self.last_checked = time.time() used_gb, total_gb = self.get_memory_usage() diff --git a/python/ray/monitor.py b/python/ray/monitor.py index 30b7f35a578e..fe1edad6380d 100644 --- a/python/ray/monitor.py +++ b/python/ray/monitor.py @@ -8,8 +8,6 @@ import traceback import json -import grpc - import ray from ray.autoscaler._private.autoscaler import StandardAutoscaler from ray.autoscaler._private.commands import teardown_cluster @@ -19,10 +17,11 @@ from ray.autoscaler._private.constants import \ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE from ray.autoscaler._private.util import DEBUG_AUTOSCALING_STATUS - -from ray.core.generated import gcs_service_pb2, gcs_service_pb2_grpc +import ray.gcs_utils +import ray.utils import ray.ray_constants as ray_constants from ray.ray_logging import 
setup_component_logger +from ray._raylet import GlobalStateAccessor from ray.experimental.internal_kv import _internal_kv_put, \ _internal_kv_initialized, _internal_kv_get @@ -91,17 +90,16 @@ def __init__(self, redis_address, redis_password=redis_password) self.redis = ray._private.services.create_redis_client( redis_address, password=redis_password) - - # Initialize the gcs stub for getting all node resource usage. - gcs_address = self.redis.get("GcsServerAddress").decode("utf-8") - gcs_channel = grpc.insecure_channel(gcs_address) - self.gcs_node_resources_stub = \ - gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel) - + self.global_state_accessor = GlobalStateAccessor( + redis_address, redis_password, False) + self.global_state_accessor.connect() # Set the redis client and mode so _internal_kv works for autoscaler. worker = ray.worker.global_worker worker.redis_client = self.redis worker.mode = 0 + # Keep a mapping from raylet client ID to IP address to use + # for updating the load metrics. + self.raylet_id_to_ip_map = {} head_node_ip = redis_address.split(":")[0] self.load_metrics = LoadMetrics(local_ip=head_node_ip) self.last_avail_resources = None @@ -119,14 +117,19 @@ def __init__(self, logger.info("Monitor: Started") + def __del__(self): + """Destruct the monitor object.""" + # We close the pubsub client to avoid leaking file descriptors. 
+ if self.global_state_accessor is not None: + self.global_state_accessor.disconnect() + self.global_state_accessor = None + def update_load_metrics(self): """Fetches resource usage data from GCS and updates load metrics.""" - request = gcs_service_pb2.GetAllResourceUsageRequest() - response = self.gcs_node_resources_stub.GetAllResourceUsage( - request, timeout=4) - resources_batch_data = response.resource_usage_data - + all_resources = self.global_state_accessor.get_all_resource_usage() + resources_batch_data = \ + ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources) for resource_message in resources_batch_data.batch: resource_load = dict(resource_message.resource_load) total_resources = dict(resource_message.resources_total) @@ -138,10 +141,17 @@ def update_load_metrics(self): pending_placement_groups = list( resources_batch_data.placement_group_load.placement_group_data) - ip = resource_message.node_manager_address - self.load_metrics.update( - ip, total_resources, available_resources, resource_load, - waiting_bundles, infeasible_bundles, pending_placement_groups) + # Update the load metrics for this raylet. + node_id = ray.utils.binary_to_hex(resource_message.node_id) + ip = self.raylet_id_to_ip_map.get(node_id) + if ip: + self.load_metrics.update(ip, total_resources, + available_resources, resource_load, + waiting_bundles, infeasible_bundles, + pending_placement_groups) + else: + logger.warning( + f"Monitor: could not find ip for node {node_id}") def update_resource_requests(self): """Fetches resource requests from the internal KV and updates load.""" @@ -156,10 +166,29 @@ def update_resource_requests(self): except Exception: logger.exception("Error parsing resource requests") + def update_raylet_map(self, _append_port=False): + """Updates internal raylet map. + + Args: + _append_port (bool): Defaults to False. Appending the port is + useful in testing, as mock clusters have many nodes with + the same IP and cannot be uniquely identified. 
+ """ + all_raylet_nodes = ray.nodes() + self.raylet_id_to_ip_map = {} + for raylet_info in all_raylet_nodes: + node_id = (raylet_info.get("DBClientID") or raylet_info["NodeID"]) + ip_address = (raylet_info.get("AuxAddress") + or raylet_info["NodeManagerAddress"]).split(":")[0] + if _append_port: + ip_address += ":" + str(raylet_info["NodeManagerPort"]) + self.raylet_id_to_ip_map[node_id] = ip_address + def _run(self): """Run the monitor loop.""" while True: + self.update_raylet_map() self.update_load_metrics() self.update_resource_requests() self.update_event_summary() @@ -335,9 +364,9 @@ def run(self): # Something went wrong, so push an error to all drivers. redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) + traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The monitor failed with the " - f"following error:\n{traceback.format_exc()}") - from ray.utils import push_error_to_driver_through_redis - push_error_to_driver_through_redis( + f"following error:\n{traceback_str}") + ray.utils.push_error_to_driver_through_redis( redis_client, ray_constants.MONITOR_DIED_ERROR, message) raise e diff --git a/python/ray/node.py b/python/ray/node.py index 05f3383a552f..425965021240 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -11,12 +11,8 @@ import subprocess import sys import tempfile -import threading import time -from typing import Optional, Dict -from collections import defaultdict - import ray import ray.ray_constants as ray_constants import ray._private.services @@ -92,7 +88,6 @@ def __init__(self, self.kernel_fate_share = bool( spawn_reaper and ray.utils.detect_fate_sharing_support()) self.all_processes = {} - self.removal_lock = threading.Lock() # Try to get node IP address with the parameters. 
if ray_params.node_ip_address: @@ -120,12 +115,24 @@ def __init__(self, raise ValueError( "Internal config parameters can only be set on the head node.") + if ray_params._lru_evict: + assert (connect_only or + head), "LRU Evict can only be passed into the head node." + self._raylet_ip_address = raylet_ip_address + self.metrics_agent_port = (ray_params.metrics_agent_port + or self._get_unused_port()[0]) + self._metrics_export_port = ray_params.metrics_export_port + if self._metrics_export_port is None: + self._metrics_export_port = self._get_unused_port()[0] + ray_params.update_if_absent( include_log_monitor=True, resources={}, temp_dir=ray.utils.get_ray_temp_dir(), + metrics_agent_port=self.metrics_agent_port, + metrics_export_port=self._metrics_export_port, worker_path=os.path.join( os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")) @@ -140,18 +147,6 @@ def __init__(self, if "plasma_store_as_thread" not in self._config: self._config["plasma_store_as_thread"] = True - # Configure log rotation parameters. - self.max_bytes = int( - os.getenv("RAY_ROTATION_MAX_BYTES", - ray_constants.LOGGING_ROTATE_BYTES)) - self.backup_count = int( - os.getenv("RAY_ROTATION_BACKUP_COUNT", - ray_constants.LOGGING_ROTATE_BACKUP_COUNT)) - - assert self.max_bytes >= 0 - assert self.backup_count >= 0 - - # Register the temp dir. if head: redis_client = None # date including microsecond @@ -165,11 +160,6 @@ def __init__(self, self._init_temp(redis_client) - # If it is a head node, try validating if - # external storage is configurable. - if head: - self.validate_external_storage() - if connect_only: # Get socket names from the configuration. 
self._plasma_store_socket_name = ( @@ -200,15 +190,6 @@ def __init__(self, self._raylet_socket_name = self._prepare_socket_file( self._ray_params.raylet_socket_name, default_prefix="raylet") - self.metrics_agent_port = self._get_cached_port( - "metrics_agent_port", default_port=ray_params.metrics_agent_port) - self._metrics_export_port = self._get_cached_port( - "metrics_export_port", default_port=ray_params.metrics_export_port) - - ray_params.update_if_absent( - metrics_agent_port=self.metrics_agent_port, - metrics_export_port=self._metrics_export_port) - if head: ray_params.update_if_absent(num_redis_shards=1) self._webui_url = None @@ -307,10 +288,9 @@ def merge_resources(env_dict, params_dict): for key in set(env_dict.keys()).intersection( set(params_dict.keys())): - if params_dict[key] != env_dict[key]: - logger.warning("Autoscaler is overriding your resource:" - "{}: {} with {}.".format( - key, params_dict[key], env_dict[key])) + logger.warning("Autoscaler is overriding your resource:" + "{}: {} with {}.".format( + key, params_dict[key], env_dict[key])) return num_cpus, num_gpus, memory, object_store_memory, result if not self._resource_spec: @@ -402,14 +382,6 @@ def socket(self): except AttributeError: return None - @property - def logging_config(self): - """Get the logging config of the current node.""" - return { - "log_rotation_max_bytes": self.max_bytes, - "log_rotation_backup_count": self.backup_count - } - @property def address_info(self): """Get a dictionary of addresses.""" @@ -424,9 +396,6 @@ def address_info(self): "metrics_export_port": self._metrics_export_port } - def is_head(self): - return self.head - def create_redis_client(self): """Create a redis client.""" return ray._private.services.create_redis_client( @@ -586,50 +555,6 @@ def _prepare_socket_file(self, socket_path, default_prefix): "{} bytes: {!r}".format(maxlen, result)) return result - def _get_cached_port(self, - port_name: str, - default_port: Optional[int] = None) -> int: - """Get 
a port number from a cache on this node. - - Different driver processes on a node should use the same ports for - some purposes, e.g. exporting metrics. This method returns a port - number for the given port name and caches it in a file. If the - port isn't already cached, an unused port is generated and cached. - - Args: - port_name (str): the name of the port, e.g. metrics_export_port - default_port (Optional[int]): The port to return and cache if no - port has already been cached for the given port_name. If None, an - unused port is generated and cached. - Returns: - port (int): the port number. - """ - file_path = os.path.join(self.get_session_dir_path(), - "ports_by_node.json") - - # Maps a Node.unique_id to a dict that maps port names to port numbers. - ports_by_node: Dict[str, Dict[str, int]] = defaultdict(dict) - - if not os.path.exists(file_path): - with open(file_path, "w") as f: - json.dump({}, f) - - with open(file_path, "r") as f: - ports_by_node.update(json.load(f)) - - if (self.unique_id in ports_by_node - and port_name in ports_by_node[self.unique_id]): - # The port has already been cached at this node, so use it. - port = int(ports_by_node[self.unique_id][port_name]) - else: - # Pick a new port to use and cache it at this node. - port = (default_port or self._get_unused_port()[0]) - ports_by_node[self.unique_id][port_name] = port - with open(file_path, "w") as f: - json.dump(ports_by_node, f) - - return port - def start_reaper_process(self): """ Start the reaper process. 
@@ -679,9 +604,7 @@ def start_log_monitor(self): stdout_file=subprocess.DEVNULL, stderr_file=subprocess.DEVNULL, redis_password=self._ray_params.redis_password, - fate_share=self.kernel_fate_share, - max_bytes=self.max_bytes, - backup_count=self.backup_count) + fate_share=self.kernel_fate_share) assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes self.all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] = [ process_info, @@ -705,8 +628,6 @@ def start_dashboard(self, require_dashboard): stderr_file=subprocess.DEVNULL, # Avoid hang(fd inherit) redis_password=self._ray_params.redis_password, fate_share=self.kernel_fate_share, - max_bytes=self.max_bytes, - backup_count=self.backup_count, port=self._ray_params.dashboard_port) assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes if process_info is not None: @@ -802,8 +723,6 @@ def start_raylet(self, fate_share=self.kernel_fate_share, socket_to_use=self.socket, head_node=self.head, - max_bytes=self.max_bytes, - backup_count=self.backup_count, start_initial_python_workers_for_first_job=self._ray_params. start_initial_python_workers_for_first_job) assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes @@ -829,9 +748,7 @@ def start_monitor(self): stderr_file=stderr_file, autoscaling_config=self._ray_params.autoscaling_config, redis_password=self._ray_params.redis_password, - fate_share=self.kernel_fate_share, - max_bytes=self.max_bytes, - backup_count=self.backup_count) + fate_share=self.kernel_fate_share) assert ray_constants.PROCESS_TYPE_MONITOR not in self.all_processes self.all_processes[ray_constants.PROCESS_TYPE_MONITOR] = [process_info] @@ -921,23 +838,6 @@ def _kill_process_type(self, 2. The process had been started in valgrind and had a non-zero exit code. 
""" - - # Ensure thread safety - with self.removal_lock: - self._kill_process_impl( - process_type, - allow_graceful=allow_graceful, - check_alive=check_alive, - wait=wait) - - def _kill_process_impl(self, - process_type, - allow_graceful=False, - check_alive=True, - wait=False): - """See `_kill_process_type`.""" - if process_type not in self.all_processes: - return process_infos = self.all_processes[process_type] if process_type != ray_constants.PROCESS_TYPE_REDIS_SERVER: assert len(process_infos) == 1 @@ -1175,53 +1075,3 @@ def remaining_processes_alive(self): True if any process that wasn't explicitly killed is still alive. """ return not any(self.dead_processes()) - - def destroy_external_storage(self): - object_spilling_config = self._config.get("object_spilling_config", {}) - if object_spilling_config: - object_spilling_config = json.loads(object_spilling_config) - from ray import external_storage - storage = external_storage.setup_external_storage( - object_spilling_config) - storage.destroy_external_storage() - - def validate_external_storage(self): - """Make sure we can setup the object spilling external storage. - This will also fill up the default setting for object spilling - if not specified. - """ - object_spilling_config = self._config.get("object_spilling_config", {}) - automatic_spilling_enabled = self._config.get( - "automatic_object_spilling_enabled", True) - if not automatic_spilling_enabled: - return - - # If the config is not specified, we fill up the default. - if not object_spilling_config: - object_spilling_config = json.dumps({ - "type": "filesystem", - "params": { - "directory_path": self._session_dir - } - }) - - # Try setting up the storage. - # Configure the proper system config. - # We need to set both ray param's system config and self._config - # because they could've been diverged at this point. 
- deserialized_config = json.loads(object_spilling_config) - self._ray_params._system_config["object_spilling_config"] = ( - object_spilling_config) - self._config["object_spilling_config"] = object_spilling_config - - is_external_storage_type_fs = ( - deserialized_config["type"] == "filesystem") - self._ray_params._system_config["is_external_storage_type_fs"] = ( - is_external_storage_type_fs) - self._config["is_external_storage_type_fs"] = ( - is_external_storage_type_fs) - - # Validate external storage usage. - from ray import external_storage - external_storage.setup_external_storage(deserialized_config) - external_storage.reset_external_storage() diff --git a/python/ray/ray_operator/__init__.py b/python/ray/operator/__init__.py similarity index 100% rename from python/ray/ray_operator/__init__.py rename to python/ray/operator/__init__.py diff --git a/python/ray/ray_operator/operator.py b/python/ray/operator/operator.py similarity index 72% rename from python/ray/ray_operator/operator.py rename to python/ray/operator/operator.py index bfbde80553ce..cf83eaa240d5 100644 --- a/python/ray/ray_operator/operator.py +++ b/python/ray/operator/operator.py @@ -9,15 +9,13 @@ from ray._private import services from ray.autoscaler._private import commands from ray import monitor -from ray.ray_operator import operator_utils +from ray.operator import operator_utils from ray import ray_constants -logger = logging.getLogger(__name__) - class RayCluster(): def __init__(self, config: Dict[str, Any]): - self.set_config(config) + self.config = config self.name = self.config["cluster_name"] self.config_path = operator_utils.config_path(self.name) @@ -25,9 +23,6 @@ def __init__(self, config: Dict[str, Any]): self.subprocess = None # type: Optional[mp.Process] - def set_config(self, config: Dict[str, Any]) -> None: - self.config = config - def do_in_subprocess(self, f: Callable[[], None], wait_to_finish: bool = False) -> None: @@ -62,8 +57,7 @@ def start_head(self) -> None: 
no_restart=False, restart_only=False, yes=True, - no_config_cache=True, - no_monitor_on_head=True) + no_config_cache=True) self.write_config() def start_monitor(self) -> None: @@ -102,42 +96,18 @@ def delete_config(self) -> None: ray_clusters = {} -last_generation = {} - -def handle_event(event_type, cluster_cr, cluster_name): - # TODO: This only detects errors in the parent process and thus doesn't - # catch cluster-specific autoscaling failures. Fix that (perhaps at - # the same time that we eliminate subprocesses). - try: - cluster_action(event_type, cluster_cr, cluster_name) - except Exception: - logger.exception(f"Error while updating RayCluster {cluster_name}.") - operator_utils.set_status(cluster_cr, cluster_name, "Error") - -def cluster_action(event_type, cluster_cr, cluster_name) -> None: - - cluster_config = operator_utils.cr_to_config(cluster_cr) +def cluster_action(cluster_config: Dict[str, Any], event_type: str) -> None: cluster_name = cluster_config["cluster_name"] - if event_type == "ADDED": - operator_utils.set_status(cluster_cr, cluster_name, "Running") ray_clusters[cluster_name] = RayCluster(cluster_config) ray_clusters[cluster_name].create_or_update() - last_generation[cluster_name] = cluster_cr["metadata"]["generation"] elif event_type == "MODIFIED": - # Check metadata.generation to determine if there's a spec change. 
- current_generation = cluster_cr["metadata"]["generation"] - if current_generation > last_generation[cluster_name]: - ray_clusters[cluster_name].set_config(cluster_config) - ray_clusters[cluster_name].create_or_update() - last_generation[cluster_name] = current_generation - + ray_clusters[cluster_name].create_or_update() elif event_type == "DELETED": ray_clusters[cluster_name].clean_up() del ray_clusters[cluster_name] - del last_generation[cluster_name] def main() -> None: @@ -149,9 +119,9 @@ def main() -> None: try: for event in cluster_cr_stream: cluster_cr = event["object"] - cluster_name = cluster_cr["metadata"]["name"] event_type = event["type"] - handle_event(event_type, cluster_cr, cluster_name) + cluster_config = operator_utils.cr_to_config(cluster_cr) + cluster_action(cluster_config, event_type) except ApiException as e: if e.status == 404: raise Exception( diff --git a/python/ray/operator/operator_utils.py b/python/ray/operator/operator_utils.py new file mode 100644 index 000000000000..08926a723857 --- /dev/null +++ b/python/ray/operator/operator_utils.py @@ -0,0 +1,99 @@ +import copy +import logging +import os +from typing import Any, Dict, Iterator + +from kubernetes.watch import Watch + +from ray.autoscaler._private.kubernetes import custom_objects_api + +RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE") + +RAY_CONFIG_DIR = os.path.expanduser("~/ray_cluster_configs") +CONFIG_SUFFIX = "_config.yaml" + +CONFIG_FIELDS = { + "maxWorkers": "max_workers", + "upscalingSpeed": "upscaling_speed", + "idleTimeoutMinutes": "idle_timeout_minutes", + "headPodType": "head_node_type", + "workerDefaultPodType": "worker_default_node_type", + "workerStartRayCommands": "worker_start_ray_commands", + "headStartRayCommands": "head_start_ray_commands", + "podTypes": "available_node_types" +} + +NODE_TYPE_FIELDS = { + "minWorkers": "min_workers", + "maxWorkers": "max_workers", + "podConfig": "node_config", + "rayResources": "resources", + "setupCommands": 
"worker_setup_commands" +} + +PROVIDER_CONFIG = { + "type": "kubernetes", + "use_internal_ips": True, + "namespace": RAY_NAMESPACE +} + +root_logger = logging.getLogger("ray") +root_logger.setLevel(logging.getLevelName("DEBUG")) + + +def config_path(cluster_name: str) -> str: + file_name = cluster_name + CONFIG_SUFFIX + return os.path.join(RAY_CONFIG_DIR, file_name) + + +def cluster_cr_stream() -> Iterator: + w = Watch() + return w.stream( + custom_objects_api().list_namespaced_custom_object, + namespace=RAY_NAMESPACE, + group="cluster.ray.io", + version="v1", + plural="rayclusters") + + +def cr_to_config(cluster_resource: Dict[str, Any]) -> Dict[str, Any]: + """Convert RayCluster custom resource to a ray cluster config for use by the + autoscaler.""" + config = translate(cluster_resource["spec"], dictionary=CONFIG_FIELDS) + config["available_node_types"] = get_node_types(cluster_resource) + config["cluster_name"] = cluster_resource["metadata"]["name"] + config["provider"] = PROVIDER_CONFIG + return config + + +def get_node_types(cluster_resource: Dict[str, Any]) -> Dict[str, Any]: + cluster_owner_reference = get_cluster_owner_reference(cluster_resource) + node_types = {} + for pod_type in cluster_resource["spec"]["podTypes"]: + name = pod_type["name"] + pod_type_copy = copy.deepcopy(pod_type) + pod_type_copy.pop("name") + node_types[name] = translate( + pod_type_copy, dictionary=NODE_TYPE_FIELDS) + # Deleting a RayCluster CR will also delete the associated pods. 
+ node_types[name]["node_config"]["metadata"].update({ + "ownerReferences": [cluster_owner_reference] + }) + return node_types + + +def get_cluster_owner_reference( + cluster_resource: Dict[str, Any]) -> Dict[str, Any]: + return { + "apiVersion": cluster_resource["apiVersion"], + "kind": cluster_resource["kind"], + "blockOwnerDeletion": True, + "controller": True, + "name": cluster_resource["metadata"]["name"], + "uid": cluster_resource["metadata"]["uid"] + } + + +def translate(configuration: Dict[str, Any], + dictionary: Dict[str, str]) -> Dict[str, Any]: + return {dictionary[field]: configuration[field] for field in configuration} diff --git a/python/ray/parameter.py b/python/ray/parameter.py index bdeec7627e58..a9b20769d1e2 100644 --- a/python/ray/parameter.py +++ b/python/ray/parameter.py @@ -1,3 +1,4 @@ +import json import logging import os @@ -17,12 +18,9 @@ class RayParams: raylet, a plasma store, a plasma manager, and some workers. It will also kill these processes when Python exits. redis_port (int): The port that the primary Redis shard should listen - to. If None, then it will fall back to - ray.ray_constants.DEFAULT_PORT, or a random port if the default is - not available. + to. If None, then a random port will be chosen. redis_shard_ports: A list of the ports to use for the non-primary Redis - shards. If None, then it will fall back to the ports right after - redis_port, or random ports if those are not available. + shards. num_cpus (int): Number of CPUs to configure the raylet with. num_gpus (int): Number of GPUs to configure the raylet with. resources: A dictionary mapping the name of a resource to the quantity @@ -102,6 +100,7 @@ class RayParams: _system_config (dict): Configuration for overriding RayConfig defaults. Used to set system configuration and for experimental Ray core feature flags. + lru_evict (bool): Enable LRU eviction if space is needed. enable_object_reconstruction (bool): Enable plasma reconstruction on failure. 
start_initial_python_workers_for_first_job (bool): If true, start @@ -198,22 +197,30 @@ def __init__(self, self.start_initial_python_workers_for_first_job = ( start_initial_python_workers_for_first_job) self._system_config = _system_config or {} + self._lru_evict = lru_evict self._enable_object_reconstruction = enable_object_reconstruction self._check_usage() # Set the internal config options for LRU eviction. if lru_evict: - raise DeprecationWarning( - "The lru_evict flag is deprecated as Ray natively " - "supports object spilling. Please read " - "https://docs.ray.io/en/master/memory-management.html#object-spilling " # noqa - "for more details.") + # Turn off object pinning. + if self._system_config is None: + self._system_config = dict() + if self._system_config.get("object_pinning_enabled", False): + raise Exception( + "Object pinning cannot be enabled if using LRU eviction.") + self._system_config["object_pinning_enabled"] = False + self._system_config["free_objects_period_milliseconds"] = 1000 # Set the internal config options for object reconstruction. if enable_object_reconstruction: # Turn off object pinning. if self._system_config is None: self._system_config = dict() + if lru_evict: + raise Exception( + "Object reconstruction cannot be enabled if using LRU " + "eviction.") print(self._system_config) self._system_config["lineage_pinning_enabled"] = True self._system_config["free_objects_period_milliseconds"] = -1 @@ -313,3 +320,13 @@ def _check_usage(self): if numpy_major <= 1 and numpy_minor < 16: logger.warning("Using ray with numpy < 1.16.0 will result in slow " "serialization. Upgrade numpy if using with ray.") + + # Make sure object spilling configuration is applicable. + object_spilling_config = self._system_config.get( + "object_spilling_config", {}) + if object_spilling_config: + object_spilling_config = json.loads(object_spilling_config) + from ray import external_storage + # Validate external storage usage. 
+ external_storage.setup_external_storage(object_spilling_config) + external_storage.reset_external_storage() diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index cbfbaaa5bc08..a5459b8637ba 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -150,9 +150,12 @@ def to_memory_units(memory_bytes, round_up): LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"] LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info'," " 'warning', 'error', 'critical'], default='info'") - -LOGGING_ROTATE_BYTES = 512 * 1024 * 1024 # 512MB. -LOGGING_ROTATE_BACKUP_COUNT = 5 # 5 Backup files at max. +# Default param for RotatingFileHandler +# maxBytes. 10G by default. We intentionally set the default value high +# so that users who won't care don't know about the existence of this. +LOGGING_ROTATE_BYTES = 10 * 1000 * 1000 * 1000 +# The default will grow logs up until 500GB without log loss. +LOGGING_ROTATE_BACKUP_COUNT = 50 # backupCount # Constants used to define the different process types. 
PROCESS_TYPE_REAPER = "reaper" @@ -169,8 +172,6 @@ def to_memory_units(memory_bytes, round_up): PROCESS_TYPE_REDIS_SERVER = "redis_server" PROCESS_TYPE_WEB_UI = "web_ui" PROCESS_TYPE_GCS_SERVER = "gcs_server" -PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER = "python-core-driver" -PROCESS_TYPE_PYTHON_CORE_WORKER = "python-core-worker" # Log file names MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log" @@ -234,4 +235,4 @@ def to_memory_units(memory_bytes, round_up): MAX_INT64_VALUE = 9223372036854775807 # Object Spilling related constants -DEFAULT_OBJECT_PREFIX = "ray_spilled_objects" +DEFAULT_OBJECT_PREFIX = "ray_spilled_object" diff --git a/python/ray/ray_logging.py b/python/ray/ray_logging.py index c9af57536b0c..56df7b5c2092 100644 --- a/python/ray/ray_logging.py +++ b/python/ray/ray_logging.py @@ -165,17 +165,15 @@ def get_worker_log_file_name(worker_type): "please report it to Ray's Github issue.") worker_name = "worker" else: - job_id = "" + job_id = ray.JobID.nil() worker_name = "io_worker" # Make sure these values are set already. 
assert ray.worker._global_node is not None assert ray.worker.global_worker is not None filename = (f"{worker_name}-" - f"{binary_to_hex(ray.worker.global_worker.worker_id)}-") - if job_id: - filename += f"{job_id}-" - filename += f"{os.getpid()}" + f"{binary_to_hex(ray.worker.global_worker.worker_id)}-" + f"{job_id}-{os.getpid()}") return filename diff --git a/python/ray/ray_operator/operator_utils.py b/python/ray/ray_operator/operator_utils.py deleted file mode 100644 index 98a31ce6f9b7..000000000000 --- a/python/ray/ray_operator/operator_utils.py +++ /dev/null @@ -1,145 +0,0 @@ -import copy -import logging -import os -from typing import Any, Dict, Iterator - -from kubernetes.watch import Watch - -from ray.autoscaler._private.kubernetes import custom_objects_api -from ray.autoscaler._private.providers import _get_default_config - -RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE") - -RAY_CONFIG_DIR = os.environ.get("RAY_CONFIG_DIR") or \ - os.path.expanduser("~/ray_cluster_configs") - -CONFIG_SUFFIX = "_config.yaml" - -CONFIG_FIELDS = { - "maxWorkers": "max_workers", - "upscalingSpeed": "upscaling_speed", - "idleTimeoutMinutes": "idle_timeout_minutes", - "headPodType": "head_node_type", - "workerStartRayCommands": "worker_start_ray_commands", - "headStartRayCommands": "head_start_ray_commands", - "podTypes": "available_node_types" -} - -NODE_TYPE_FIELDS = { - "minWorkers": "min_workers", - "maxWorkers": "max_workers", - "podConfig": "node_config", - "rayResources": "resources", - "setupCommands": "worker_setup_commands" -} - -PROVIDER_CONFIG = { - "type": "kubernetes", - "use_internal_ips": True, - "namespace": RAY_NAMESPACE -} - -root_logger = logging.getLogger("ray") -root_logger.setLevel(logging.getLevelName("DEBUG")) - - -def config_path(cluster_name: str) -> str: - file_name = cluster_name + CONFIG_SUFFIX - return os.path.join(RAY_CONFIG_DIR, file_name) - - -def cluster_cr_stream() -> Iterator: - w = Watch() - return w.stream( - 
custom_objects_api().list_namespaced_custom_object, - namespace=RAY_NAMESPACE, - group="cluster.ray.io", - version="v1", - plural="rayclusters") - - -def cr_to_config(cluster_resource: Dict[str, Any]) -> Dict[str, Any]: - """Convert RayCluster custom resource to a ray cluster config for use by the - autoscaler.""" - config = translate(cluster_resource["spec"], dictionary=CONFIG_FIELDS) - cluster_name = cluster_resource["metadata"]["name"] - config["available_node_types"] = get_node_types(cluster_resource, - cluster_name) - config["cluster_name"] = cluster_name - config["provider"] = get_provider_config(cluster_name) - return config - - -def get_node_types(cluster_resource: Dict[str, Any], cluster_name) ->\ - Dict[str, Any]: - cluster_owner_reference = get_cluster_owner_reference( - cluster_resource, cluster_name) - node_types = {} - for pod_type in cluster_resource["spec"]["podTypes"]: - name = pod_type["name"] - pod_type_copy = copy.deepcopy(pod_type) - pod_type_copy.pop("name") - node_type = translate(pod_type_copy, dictionary=NODE_TYPE_FIELDS) - metadata = node_type["node_config"]["metadata"] - metadata.update({"ownerReferences": [cluster_owner_reference]}) - if name == cluster_resource["spec"]["headPodType"]: - if "labels" not in metadata: - metadata["labels"] = {} - metadata["labels"].update(head_service_selector(cluster_name)) - node_types[name] = node_type - return node_types - - -def get_provider_config(cluster_name): - default_kubernetes_config = _get_default_config({"type": "kubernetes"}) - default_provider_conf = default_kubernetes_config["provider"] - - # Configure head service for dashboard and client - head_service = copy.deepcopy(default_provider_conf["services"][0]) - service_name = f"{cluster_name}-ray-head" - head_service["metadata"]["name"] = service_name - head_service["spec"]["selector"] = head_service_selector(cluster_name) - - provider_conf = {} - provider_conf["type"] = "kubernetes" - provider_conf["use_internal_ips"] = True - 
provider_conf["namespace"] = RAY_NAMESPACE - provider_conf["services"] = [head_service] - return provider_conf - - -def head_service_selector(cluster_name): - return {"component": f"{cluster_name}-ray-head"} - - -def get_cluster_owner_reference(cluster_resource: Dict[str, Any], - cluster_name: str) -> Dict[str, Any]: - return { - "apiVersion": cluster_resource["apiVersion"], - "kind": cluster_resource["kind"], - "blockOwnerDeletion": True, - "controller": True, - "name": cluster_name, - "uid": cluster_resource["metadata"]["uid"] - } - - -def translate(configuration: Dict[str, Any], - dictionary: Dict[str, str]) -> Dict[str, Any]: - return { - dictionary[field]: configuration[field] - for field in dictionary if field in configuration - } - - -def set_status(cluster_cr: Dict[str, Any], cluster_name: str, - status: str) -> None: - # TODO: Add retry logic in case of 409 due to old resource version. - cluster_cr["status"] = {"phase": status} - custom_objects_api()\ - .patch_namespaced_custom_object_status(namespace=RAY_NAMESPACE, - group="cluster.ray.io", - version="v1", - plural="rayclusters", - name=cluster_name, - body=cluster_cr) diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index 3b8b42062b3e..e717e2d28fe7 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -4,8 +4,6 @@ from ray import cloudpickle as pickle from ray._raylet import PythonFunctionDescriptor from ray import cross_language, Language -from ray._private.client_mode_hook import client_mode_convert_function -from ray._private.client_mode_hook import client_mode_should_convert from ray.util.placement_group import ( PlacementGroup, check_placement_group_index, @@ -183,26 +181,6 @@ def _remote(self, override_environment_variables=None, name=""): """Submit the remote function for execution.""" - if client_mode_should_convert(): - return client_mode_convert_function( - self, - args, - kwargs, - num_returns=num_returns, - num_cpus=num_cpus, - 
num_gpus=num_gpus, - memory=memory, - object_store_memory=object_store_memory, - accelerator_type=accelerator_type, - resources=resources, - max_retries=max_retries, - placement_group=placement_group, - placement_group_bundle_index=placement_group_bundle_index, - placement_group_capture_child_tasks=( - placement_group_capture_child_tasks), - override_environment_variables=override_environment_variables, - name=name) - worker = ray.worker.global_worker worker.check_connected() diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py index fed3ab132ae0..fa922cfa0267 100644 --- a/python/ray/runtime_context.py +++ b/python/ray/runtime_context.py @@ -1,6 +1,5 @@ import ray.worker import logging -from ray._private.client_mode_hook import client_mode_hook logger = logging.getLogger(__name__) @@ -150,7 +149,6 @@ def should_capture_child_tasks_in_placement_group(self): _runtime_context = None -@client_mode_hook def get_runtime_context(): global _runtime_context if _runtime_context is None: diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 50ac89f03bf7..6fecd2dc272b 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -117,13 +117,13 @@ def cli(logging_level, logging_format): "-p", required=False, type=int, - default=ray_constants.DEFAULT_DASHBOARD_PORT, + default=8265, help="The local port to forward to the dashboard") @click.option( "--remote-port", required=False, type=int, - default=ray_constants.DEFAULT_DASHBOARD_PORT, + default=8265, help="The remote port your dashboard runs on") def dashboard(cluster_config_file, cluster_name, port, remote_port): """Port-forward a Ray cluster's dashboard to the local machine.""" @@ -285,7 +285,7 @@ def debug(address): "--ray-client-server-port", required=False, type=int, - default=10001, + default=None, help="the port number the ray client server will bind on. 
If not set, " "the ray client server will not be started.") @click.option( @@ -739,7 +739,6 @@ def stop(force, verbose, log_style, log_color): total_found = 0 total_stopped = 0 - stopped = [] for keyword, filter_by_cmd in processes_to_kill: if filter_by_cmd and is_linux and len(keyword) > 15: # getting here is an internal bug, so we do not use cli_logger @@ -778,7 +777,6 @@ def stop(force, verbose, log_style, log_color): cf.dimmed("(via SIGTERM)")) total_stopped += 1 - stopped.append(proc) except psutil.NoSuchProcess: cli_logger.verbose( "Attempted to stop `{}`, but process was already dead.", @@ -801,8 +799,8 @@ def stop(force, verbose, log_style, log_color): cli_logger.warning("Try running the command again, or use `{}`.", cf.bold("--force")) - # Wait for the processes to actually stop. - psutil.wait_procs(stopped, timeout=2) + # TODO(maximsmol): we should probably block until the processes actually + # all died somehow @cli.command() @@ -1372,13 +1370,7 @@ def timeline(address): type=str, default=ray_constants.REDIS_DEFAULT_PASSWORD, help="Connect to ray with redis_password.") -@click.option( - "--stats-only", - is_flag=True, - type=bool, - default=False, - help="Connect to ray with redis_password.") -def memory(address, redis_password, stats_only): +def memory(address, redis_password): """Print object references held in a Ray cluster.""" if not address: address = services.get_ray_address_to_use_or_die() @@ -1387,8 +1379,7 @@ def memory(address, redis_password, stats_only): raylet = state.node_table()[0] print( ray.internal.internal_api.memory_summary(raylet["NodeManagerAddress"], - raylet["NodeManagerPort"], - stats_only)) + raylet["NodeManagerPort"])) @cli.command() diff --git a/python/ray/serialization.py b/python/ray/serialization.py index a2009e4fd453..724cf477ef61 100644 --- a/python/ray/serialization.py +++ b/python/ray/serialization.py @@ -31,7 +31,7 @@ class DeserializationError(Exception): pass -def _object_ref_deserializer(binary, owner_address): +def 
object_ref_deserializer(reduced_obj_ref, owner_address): # NOTE(suquark): This function should be a global function so # cloudpickle can access it directly. Otherwise couldpickle # has to dump the whole function definition, which is inefficient. @@ -40,7 +40,9 @@ def _object_ref_deserializer(binary, owner_address): # the core worker to resolve the value. This is to make sure # that the ref count for the ObjectRef is greater than 0 by the # time the core worker resolves the value of the object. - obj_ref = ray.ObjectRef(binary) + + # UniqueIDs are serialized as (class name, (unique bytes,)). + obj_ref = reduced_obj_ref[0](*reduced_obj_ref[1]) # TODO(edoakes): we should be able to just capture a reference # to 'self' here instead, but this function is itself pickled @@ -59,7 +61,7 @@ def _object_ref_deserializer(binary, owner_address): return obj_ref -def _actor_handle_deserializer(serialized_obj): +def actor_handle_deserializer(serialized_obj): # If this actor handle was stored in another object, then tell the # core worker. 
context = ray.worker.global_worker.get_serialization_context() @@ -83,7 +85,7 @@ def actor_handle_reducer(obj): serialized, actor_handle_id = obj._serialization_helper() # Update ref counting for the actor handle self.add_contained_object_ref(actor_handle_id) - return _actor_handle_deserializer, (serialized, ) + return actor_handle_deserializer, (serialized, ) self._register_cloudpickle_reducer(ray.actor.ActorHandle, actor_handle_reducer) @@ -94,16 +96,13 @@ def object_ref_reducer(obj): worker.check_connected() obj, owner_address = ( worker.core_worker.serialize_and_promote_object_ref(obj)) - return _object_ref_deserializer, (obj.binary(), owner_address) + return object_ref_deserializer, (obj.__reduce__(), owner_address) self._register_cloudpickle_reducer(ray.ObjectRef, object_ref_reducer) def _register_cloudpickle_reducer(self, cls, reducer): pickle.CloudPickler.dispatch[cls] = reducer - def _unregister_cloudpickle_reducer(self, cls): - pickle.CloudPickler.dispatch.pop(cls, None) - def _register_cloudpickle_serializer(self, cls, custom_serializer, custom_deserializer): def _CloudPicklerReducer(obj): @@ -199,7 +198,7 @@ def _deserialize_object(self, data, metadata, object_ref): elif metadata_fields[ 0] == ray_constants.OBJECT_METADATA_TYPE_ACTOR_HANDLE: obj = self._deserialize_msgpack_data(data, metadata_fields) - return _actor_handle_deserializer(obj) + return actor_handle_deserializer(obj) # Otherwise, return an exception object based on # the error type. 
try: diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 4c0a0a91ff7b..b42cd78464a7 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -66,8 +66,6 @@ def check(self, *args, **kwargs): class ThreadProxiedRouter: def __init__(self, controller_handle, sync: bool): - self.controller_handle = controller_handle - self.sync = sync self.router = Router(controller_handle) if sync: @@ -94,11 +92,6 @@ def _remote(self, endpoint_name, handle_options, request_data, **kwargs) return coro - def __reduce__(self): - deserializer = ThreadProxiedRouter - serialized_data = (self.controller_handle, self.sync) - return deserializer, serialized_data - class Client: def __init__(self, @@ -330,23 +323,22 @@ def get_backend_config(self, backend_tag: str) -> BackendConfig: def create_backend( self, backend_tag: str, - backend_def: Union[Callable, Type[Callable], str], - *init_args: Any, + func_or_class: Union[Callable, Type[Callable]], + *actor_init_args: Any, ray_actor_options: Optional[Dict] = None, config: Optional[Union[BackendConfig, Dict[str, Any]]] = None, env: Optional[CondaEnv] = None) -> None: """Create a backend with the provided tag. + The backend will serve requests with func_or_class. + Args: backend_tag (str): a unique tag assign to identify this backend. - backend_def (callable, class, str): a function or class - implementing __call__ and returning a JSON-serializable object - or a Starlette Response object. A string import path can also - be provided (e.g., "my_module.MyClass"), in which case the - underlying function or class will be imported dynamically in - the worker replicas. - *init_args (optional): the arguments to pass to the class - initialization method. Not valid if backend_def is a function. + func_or_class (callable, class): a function or a class implementing + __call__, returning a JSON-serializable object or a + Starlette Response object. 
+ *actor_init_args (optional): the arguments to pass to the class + initialization method. ray_actor_options (optional): options to be passed into the @ray.remote decorator for the backend actor. config (dict, serve.BackendConfig, optional): configuration options @@ -394,7 +386,9 @@ def create_backend( ray_actor_options.update( override_environment_variables={"PYTHONHOME": conda_env_dir}) replica_config = ReplicaConfig( - backend_def, *init_args, ray_actor_options=ray_actor_options) + func_or_class, + *actor_init_args, + ray_actor_options=ray_actor_options) metadata = BackendMetadata( accepts_batches=replica_config.accepts_batches, is_blocking=replica_config.is_blocking) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index ba6e2260f2f8..673c4b2cfbc8 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -1,8 +1,7 @@ import asyncio +from asyncio.futures import Future from collections import defaultdict -from enum import Enum -import time -from typing import Dict, List, Optional, Tuple +from typing import Dict, Any, List, Optional, Set, Tuple import ray import ray.cloudpickle as pickle @@ -18,6 +17,7 @@ ) from ray.serve.config import BackendConfig, ReplicaConfig from ray.serve.constants import LongPollKey +from ray.serve.exceptions import RayServeException from ray.serve.kv_store import RayInternalKVStore from ray.serve.long_poll import LongPollHost from ray.serve.utils import (format_actor_name, get_random_letters, logger, @@ -30,150 +30,6 @@ _RESOURCE_CHECK_ENABLED = True -class ReplicaState(Enum): - SHOULD_START = 1 - STARTING = 2 - RUNNING = 3 - SHOULD_STOP = 4 - STOPPING = 5 - STOPPED = 6 - - -class BackendReplica: - def __init__(self, controller_name: str, detached: bool, - replica_tag: ReplicaTag, backend_tag: BackendTag): - self._actor_name = format_actor_name(replica_tag, controller_name) - self._controller_name = controller_name - self._detached = detached - self._replica_tag = 
replica_tag - self._backend_tag = backend_tag - self._actor_handle = None - self._startup_obj_ref = None - self._drain_obj_ref = None - self._state = ReplicaState.SHOULD_START - - def __get_state__(self): - clean_dict = self.__dict__.copy() - del clean_dict["_actor_handle"] - del clean_dict["_startup_obj_ref"] - del clean_dict["_drain_obj_ref"] - return clean_dict - - def __set_state__(self, d): - self.__dict__ = d - self._actor_handle = None - self._startup_obj_ref = None - self._drain_obj_ref = None - self._recover_from_checkpoint() - - def _recover_from_checkpoint(self): - if self._state == ReplicaState.STARTING: - # We do not need to pass in the class here because the actor - # creation has already been started if this class was checkpointed - # in the STARTING state. - self.start() - elif self._state == ReplicaState.RUNNING: - # Fetch actor handles for all backend replicas in the system. - # The actors must exist if this class was checkpointed in the - # RUNNING state. - self._actor_handle = ray.get_actor(self._actor_name) - elif self._state == ReplicaState.STOPPING: - self.stop() - - def start(self, backend_info: Optional[BackendInfo]): - assert self._state in { - ReplicaState.SHOULD_START, ReplicaState.STARTING - }, (f"State must be {ReplicaState.SHOULD_START} or " - f"{ReplicaState.STARTING}, *not* {self._state}") - try: - self._actor_handle = ray.get_actor(self._actor_name) - except ValueError: - logger.debug("Starting replica '{}' for backend '{}'.".format( - self._replica_tag, self._backend_tag)) - self._actor_handle = ray.remote(backend_info.worker_class).options( - name=self._actor_name, - lifetime="detached" if self._detached else None, - max_restarts=-1, - max_task_retries=-1, - **backend_info.replica_config.ray_actor_options).remote( - self._backend_tag, self._replica_tag, - backend_info.replica_config.init_args, - backend_info.backend_config, self._controller_name) - self._startup_obj_ref = self._actor_handle.ready.remote() - self._state = 
ReplicaState.STARTING - - def check_started(self): - if self._state == ReplicaState.RUNNING: - return True - assert self._state == ReplicaState.STARTING, ( - f"State must be {ReplicaState.STARTING}, *not* {self._state}") - ready, _ = ray.wait([self._startup_obj_ref], timeout=0) - if len(ready) == 1: - self._state = ReplicaState.RUNNING - return True - return False - - def set_should_stop(self, graceful_shutdown_timeout_s: Duration): - self._state = ReplicaState.SHOULD_STOP - self._graceful_shutdown_timeout_s = graceful_shutdown_timeout_s - - def stop(self): - # We need to handle transitions from: - # SHOULD_START -> SHOULD_STOP -> STOPPING - # This means that the replica_handle may not have been created. - - assert self._state in { - ReplicaState.SHOULD_STOP, ReplicaState.STOPPING - }, (f"State must be {ReplicaState.SHOULD_STOP} or " - f"{ReplicaState.STOPPING}, *not* {self._state}") - - def drain_actor(actor_name): - # NOTE: the replicas may already be stopped if we failed - # after stopping them but before writing a checkpoint. - try: - replica = ray.get_actor(actor_name) - except ValueError: - return None - return replica.drain_pending_queries.remote() - - self._state = ReplicaState.STOPPING - self._drain_obj_ref = drain_actor(self._actor_name) - self._shutdown_deadline = time.time( - ) + self._graceful_shutdown_timeout_s - - def check_stopped(self): - if self._state == ReplicaState.STOPPED: - return True - assert self._state == ReplicaState.STOPPING, ( - f"State must be {ReplicaState.STOPPING}, *not* {self._state}") - - try: - replica = ray.get_actor(self._actor_name) - except ValueError: - self._state = ReplicaState.STOPPED - return True - - ready, _ = ray.wait([self._drain_obj_ref], timeout=0) - timeout_passed = time.time() > self._shutdown_deadline - - if len(ready) == 1 or timeout_passed: - if timeout_passed: - # Graceful period passed, kill it forcefully. 
- logger.debug( - f"{self._actor_name} did not shutdown after " - f"{self._graceful_shutdown_timeout_s}s, force-killing.") - - ray.kill(replica, no_restart=True) - self._state = ReplicaState.STOPPED - return True - return False - - def get_actor_handle(self): - assert self._state == ReplicaState.RUNNING, ( - f"State must be {ReplicaState.RUNNING}, *not* {self._state}") - return self._actor_handle - - class BackendState: """Manages all state for backends in the system. @@ -190,65 +46,79 @@ def __init__(self, controller_name: str, detached: bool, self._long_poll_host = long_poll_host self._goal_manager = goal_manager - self._replicas: Dict[BackendTag, Dict[ReplicaState, List[ - BackendReplica]]] = defaultdict(lambda: defaultdict(list)) - self._backend_metadata: Dict[BackendTag, BackendInfo] = dict() - self._target_replicas: Dict[BackendTag, int] = defaultdict(int) - self.backend_goals: Dict[BackendTag, GoalId] = dict() + # Non-checkpointed state. + self.currently_starting_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag, ActorHandle]] = dict() + self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag]] = dict() - # Un-Checkpointed state. - self.pending_goals: Dict[GoalId, asyncio.Event] = dict() + # Checkpointed state. 
+ self.backends: Dict[BackendTag, BackendInfo] = dict() + self.backend_replicas: Dict[BackendTag, Dict[ + ReplicaTag, ActorHandle]] = defaultdict(dict) + self.backend_goals: Dict[BackendTag, GoalId] = dict() + self.backend_replicas_to_start: Dict[BackendTag, List[ + ReplicaTag]] = defaultdict(list) + self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[ + ReplicaTag, Duration]]] = defaultdict(list) + self.backends_to_remove: List[BackendTag] = list() checkpoint = self._kv_store.get(CHECKPOINT_KEY) if checkpoint is not None: - (self._replicas, self._backend_metadata, self._target_replicas, - self.backend_goals, pending_goal_ids) = pickle.loads(checkpoint) + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backend_to_remove, + pending_goal_ids) = pickle.loads(checkpoint) for goal_id in pending_goal_ids: self._goal_manager.create_goal(goal_id) + # Fetch actor handles for all backend replicas in the system. + # All of these backend_replicas are guaranteed to already exist + # because they would not be written to a checkpoint in + # self.backend_replicas until they were created. 
+ for backend_tag, replica_dict in self.backend_replicas.items(): + for replica_tag in replica_dict.keys(): + replica_name = format_actor_name(replica_tag, + self._controller_name) + self.backend_replicas[backend_tag][ + replica_tag] = ray.get_actor(replica_name) + self._notify_backend_configs_changed() self._notify_replica_handles_changed() def _checkpoint(self) -> None: self._kv_store.put( CHECKPOINT_KEY, - pickle.dumps((self._replicas, self._backend_metadata, - self._target_replicas, self.backend_goals, - self._goal_manager.get_pending_goal_ids()))) + pickle.dumps( + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backends_to_remove, + self._goal_manager.get_pending_goal_ids()))) def _notify_backend_configs_changed(self) -> None: self._long_poll_host.notify_changed(LongPollKey.BACKEND_CONFIGS, self.get_backend_configs()) - def get_running_replica_handles( - self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: - return { - backend_tag: { - backend_replica._replica_tag: - backend_replica.get_actor_handle() - for backend_replica in state_to_replica_dict[ - ReplicaState.RUNNING] - } - for backend_tag, state_to_replica_dict in self._replicas.items() - } - def _notify_replica_handles_changed(self) -> None: self._long_poll_host.notify_changed( LongPollKey.REPLICA_HANDLES, { backend_tag: list(replica_dict.values()) - for backend_tag, replica_dict in - self.get_running_replica_handles().items() + for backend_tag, replica_dict in self.backend_replicas.items() }) def get_backend_configs(self) -> Dict[BackendTag, BackendConfig]: return { tag: info.backend_config - for tag, info in self._backend_metadata.items() + for tag, info in self.backends.items() } + def get_replica_handles( + self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: + return self.backend_replicas + def get_backend(self, backend_tag: BackendTag) -> Optional[BackendInfo]: - return 
self._backend_metadata.get(backend_tag) + return self.backends.get(backend_tag) def _set_backend_goal(self, backend_tag: BackendTag, backend_info: BackendInfo) -> None: @@ -256,11 +126,7 @@ def _set_backend_goal(self, backend_tag: BackendTag, new_goal_id = self._goal_manager.create_goal() if backend_info is not None: - self._backend_metadata[backend_tag] = backend_info - self._target_replicas[ - backend_tag] = backend_info.backend_config.num_replicas - else: - self._target_replicas[backend_tag] = 0 + self.backends[backend_tag] = backend_info self.backend_goals[backend_tag] = new_goal_id @@ -270,25 +136,31 @@ def create_backend(self, backend_tag: BackendTag, backend_config: BackendConfig, replica_config: ReplicaConfig) -> Optional[GoalId]: # Ensures this method is idempotent. - backend_info = self._backend_metadata.get(backend_tag) + backend_info = self.backends.get(backend_tag) if backend_info is not None: if (backend_info.backend_config == backend_config and backend_info.replica_config == replica_config): return None - backend_replica_class = create_backend_replica( - replica_config.backend_def) + backend_replica = create_backend_replica(replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. backend_info = BackendInfo( - worker_class=backend_replica_class, + worker_class=backend_replica, backend_config=backend_config, replica_config=replica_config) new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, backend_info) + try: + self.scale_backend_replicas(backend_tag, + backend_config.num_replicas) + except RayServeException as e: + del self.backends[backend_tag] + raise e + # NOTE(edoakes): we must write a checkpoint before starting new # or pushing the updated config to avoid inconsistent state if we # crash while making the change. 
@@ -303,15 +175,20 @@ def delete_backend(self, backend_tag: BackendTag, force_kill: bool = False) -> Optional[GoalId]: # This method must be idempotent. We should validate that the # specified backend exists on the client. - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: return None + # Scale its replicas down to 0. + self.scale_backend_replicas(backend_tag, 0, force_kill) + + # Remove the backend's metadata. + del self.backends[backend_tag] + + # Add the intention to remove the backend from the routers. + self.backends_to_remove.append(backend_tag) + new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, None) - if force_kill: - self._backend_metadata[ - backend_tag].backend_config.\ - experimental_graceful_shutdown_timeout_s = 0 self._checkpoint() if existing_goal_id is not None: @@ -320,18 +197,20 @@ def delete_backend(self, backend_tag: BackendTag, def update_backend_config(self, backend_tag: BackendTag, config_options: BackendConfig): - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: raise ValueError(f"Backend {backend_tag} is not registered") - stored_backend_config = self._backend_metadata[ - backend_tag].backend_config + stored_backend_config = self.backends[backend_tag].backend_config updated_config = stored_backend_config.copy( update=config_options.dict(exclude_unset=True)) updated_config._validate_complete() - self._backend_metadata[backend_tag].backend_config = updated_config + self.backends[backend_tag].backend_config = updated_config new_goal_id, existing_goal_id = self._set_backend_goal( - backend_tag, self._backend_metadata[backend_tag]) + backend_tag, self.backends[backend_tag]) + + # Scale the replicas with the new configuration. 
+ self.scale_backend_replicas(backend_tag, updated_config.num_replicas) # NOTE(edoakes): we must write a checkpoint before pushing the # update to avoid inconsistent state if we crash after pushing the @@ -347,40 +226,65 @@ def update_backend_config(self, backend_tag: BackendTag, return new_goal_id - def _scale_backend_replicas( + def _start_backend_replica(self, backend_tag: BackendTag, + replica_tag: ReplicaTag) -> ActorHandle: + """Start a replica and return its actor handle. + + Checks if the named actor already exists before starting a new one. + + Assumes that the backend configuration is already in the Goal State. + """ + # NOTE(edoakes): the replicas may already be created if we + # failed after creating them but before writing a + # checkpoint. + replica_name = format_actor_name(replica_tag, self._controller_name) + try: + replica_handle = ray.get_actor(replica_name) + except ValueError: + logger.debug("Starting replica '{}' for backend '{}'.".format( + replica_tag, backend_tag)) + backend_info = self.get_backend(backend_tag) + + replica_handle = ray.remote(backend_info.worker_class).options( + name=replica_name, + lifetime="detached" if self._detached else None, + max_restarts=-1, + max_task_retries=-1, + **backend_info.replica_config.ray_actor_options).remote( + backend_tag, replica_tag, + backend_info.replica_config.actor_init_args, + backend_info.backend_config, self._controller_name) + + return replica_handle + + def scale_backend_replicas( self, backend_tag: BackendTag, num_replicas: int, - ) -> bool: + force_kill: bool = False, + ) -> None: """Scale the given backend to the number of replicas. NOTE: this does not actually start or stop the replicas, but instead - adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP. - The caller is responsible for then first writing a checkpoint and then - actually starting/stopping the intended replicas. 
This avoids - inconsistencies with starting/stopping a replica and then crashing - before writing a checkpoint. + adds the intention to start/stop them to self.backend_replicas_to_start + and self.backend_replicas_to_stop. The caller is responsible for then + first writing a checkpoint and then actually starting/stopping the + intended replicas. This avoids inconsistencies with starting/stopping a + replica and then crashing before writing a checkpoint. """ + logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) - assert (backend_tag in self._backend_metadata + assert (backend_tag in self.backends ), "Backend {} is not registered.".format(backend_tag) assert num_replicas >= 0, ("Number of replicas must be" " greater than or equal to 0.") - current_num_replicas = sum([ - len(self._replicas[backend_tag][ReplicaState.SHOULD_START]), - len(self._replicas[backend_tag][ReplicaState.STARTING]), - len(self._replicas[backend_tag][ReplicaState.RUNNING]), - ]) - + current_num_replicas = len(self.backend_replicas[backend_tag]) delta_num_replicas = num_replicas - current_num_replicas - backend_info: BackendInfo = self._backend_metadata[backend_tag] - if delta_num_replicas == 0: - return False - - elif delta_num_replicas > 0: + backend_info: BackendInfo = self.backends[backend_tag] + if delta_num_replicas > 0: can_schedule = try_schedule_resources_on_nodes(requirements=[ backend_info.replica_config.resource_dict for _ in range(delta_num_replicas) @@ -388,11 +292,10 @@ def _scale_backend_replicas( if _RESOURCE_CHECK_ENABLED and not all(can_schedule): num_possible = sum(can_schedule) - logger.error( + raise RayServeException( "Cannot scale backend {} to {} replicas. Ray Serve tried " "to add {} replicas but the resources only allows {} " - "to be added. This is not a problem if the cluster is " - "autoscaling. To fix this, consider scaling to replica to " + "to be added. 
To fix this, consider scaling to replica to " "{} or add more resources to the cluster. You can check " "avaiable resources with ray.nodes().".format( backend_tag, num_replicas, delta_num_replicas, @@ -402,133 +305,154 @@ def _scale_backend_replicas( delta_num_replicas, backend_tag)) for _ in range(delta_num_replicas): replica_tag = "{}#{}".format(backend_tag, get_random_letters()) - self._replicas[backend_tag][ReplicaState.SHOULD_START].append( - BackendReplica(self._controller_name, self._detached, - replica_tag, backend_tag)) + self.backend_replicas_to_start[backend_tag].append(replica_tag) elif delta_num_replicas < 0: logger.debug("Removing {} replicas from backend '{}'".format( -delta_num_replicas, backend_tag)) - assert self._target_replicas[backend_tag] >= delta_num_replicas - + assert len( + self.backend_replicas[backend_tag]) >= delta_num_replicas + replicas_copy = self.backend_replicas.copy() for _ in range(-delta_num_replicas): - replica_state_dict = self._replicas[backend_tag] - list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \ - or replica_state_dict[ReplicaState.STARTING] \ - or replica_state_dict[ReplicaState.RUNNING] - - assert len(list_to_use), replica_state_dict - replica_to_stop = list_to_use.pop() + replica_tag, _ = replicas_copy[backend_tag].popitem() graceful_timeout_s = (backend_info.backend_config. 
experimental_graceful_shutdown_timeout_s) - - replica_to_stop.set_should_stop(graceful_timeout_s) - self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append( - replica_to_stop) - - return True - - def _scale_all_backends(self): - checkpoint_needed = False - for backend_tag, num_replicas in list(self._target_replicas.items()): - checkpoint_needed |= self._scale_backend_replicas( - backend_tag, num_replicas) - if num_replicas == 0: - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if checkpoint_needed: - self._checkpoint() - - def _pop_replicas_of_state(self, state: ReplicaState - ) -> List[Tuple[ReplicaState, BackendTag]]: - replicas = [] - for backend_tag, state_to_replica_dict in self._replicas.items(): - if state in state_to_replica_dict: - replicas.extend( - (replica, backend_tag) - for replica in state_to_replica_dict.pop(state)) - - return replicas + if force_kill: + graceful_timeout_s = 0 + self.backend_replicas_to_stop[backend_tag].append(( + replica_tag, + graceful_timeout_s, + )) + + def _start_pending_replicas(self): + for backend_tag, replicas_to_create in self.backend_replicas_to_start.\ + items(): + for replica_tag in replicas_to_create: + replica_handle = self._start_backend_replica( + backend_tag, replica_tag) + ready_future = replica_handle.ready.remote().as_future() + self.currently_starting_replicas[ready_future] = ( + backend_tag, replica_tag, replica_handle) + + def _stop_pending_replicas(self): + for backend_tag, replicas_to_stop in ( + self.backend_replicas_to_stop.items()): + for replica_tag, shutdown_timeout in replicas_to_stop: + replica_name = format_actor_name(replica_tag, + self._controller_name) + + async def kill_actor(replica_name_to_use): + # NOTE: the replicas may already be stopped if we failed + # after stopping them but before writing a checkpoint. 
+ try: + replica = ray.get_actor(replica_name_to_use) + except ValueError: + return + + try: + await asyncio.wait_for( + replica.drain_pending_queries.remote(), + timeout=shutdown_timeout) + except asyncio.TimeoutError: + # Graceful period passed, kill it forcefully. + logger.debug( + f"{replica_name_to_use} did not shutdown after " + f"{shutdown_timeout}s, killing.") + finally: + ray.kill(replica, no_restart=True) + + self.currently_stopping_replicas[asyncio.ensure_future( + kill_actor(replica_name))] = (backend_tag, replica_tag) + + async def _check_currently_starting_replicas(self) -> int: + """Returns the number of pending replicas waiting to start""" + in_flight: Set[Future[Any]] = set() + + if self.currently_starting_replicas: + done, in_flight = await asyncio.wait( + list(self.currently_starting_replicas.keys()), timeout=0) + for fut in done: + (backend_tag, replica_tag, + replica_handle) = self.currently_starting_replicas.pop(fut) + self.backend_replicas[backend_tag][ + replica_tag] = replica_handle + + backend = self.backend_replicas_to_start.get(backend_tag) + if backend: + try: + backend.remove(replica_tag) + except ValueError: + pass + if len(backend) == 0: + del self.backend_replicas_to_start[backend_tag] + + async def _check_currently_stopping_replicas(self) -> int: + """Returns the number of replicas waiting to stop""" + in_flight: Set[Future[Any]] = set() + + if self.currently_stopping_replicas: + done_stopping, in_flight = await asyncio.wait( + list(self.currently_stopping_replicas.keys()), timeout=0) + for fut in done_stopping: + (backend_tag, + replica_tag) = self.currently_stopping_replicas.pop(fut) + + backend_to_stop = self.backend_replicas_to_stop.get( + backend_tag) + + if backend_to_stop: + try: + backend_to_stop.remove(replica_tag) + except ValueError: + pass + if len(backend_to_stop) == 0: + del self.backend_replicas_to_stop[backend_tag] + + backend = self.backend_replicas.get(backend_tag) + if backend: + try: + del backend[replica_tag] + 
except KeyError: + pass + + if len(self.backend_replicas[backend_tag]) == 0: + del self.backend_replicas[backend_tag] def _completed_goals(self) -> List[GoalId]: completed_goals = [] - all_tags = set(self._replicas.keys()).union( - set(self._backend_metadata.keys())) + all_tags = set(self.backend_replicas.keys()).union( + set(self.backends.keys())) for backend_tag in all_tags: - desired_num_replicas = self._target_replicas.get(backend_tag) - state_dict = self._replicas.get(backend_tag, {}) - existing_info = state_dict.get(ReplicaState.RUNNING, []) - - # If we have pending ops, the current goal is *not* ready - if (state_dict.get(ReplicaState.SHOULD_START) - or state_dict.get(ReplicaState.STARTING) - or state_dict.get(ReplicaState.SHOULD_STOP) - or state_dict.get(ReplicaState.STOPPING)): - continue - - # Check for deleting. - if (not desired_num_replicas or - desired_num_replicas == 0) and \ + desired_info = self.backends.get(backend_tag) + existing_info = self.backend_replicas.get(backend_tag) + # Check for deleting + if (not desired_info or + desired_info.backend_config.num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) - - # Check for a non-zero number of backends. - if (desired_num_replicas and existing_info) \ - and desired_num_replicas == len(existing_info): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) - return [goal for goal in completed_goals if goal] + completed_goals.append(self.backend_goals.get(backend_tag)) - def update(self) -> bool: - """Updates the state of all running replicas to match the goal state. 
- """ - self._scale_all_backends() + # Check for a non-zero number of backends + if desired_info and existing_info and desired_info.backend_config.\ + num_replicas == len(existing_info): + completed_goals.append(self.backend_goals.get(backend_tag)) + return [goal for goal in completed_goals if goal] + async def update(self) -> bool: for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_START): - replica_state.start(self._backend_metadata[backend_tag]) - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_STOP): - replica_state.stop() - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - transition_triggered = False - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STARTING): - if replica_state.check_started(): - self._replicas[backend_tag][ReplicaState.RUNNING].append( - replica_state) - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STOPPING): - if replica_state.check_stopped(): - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - for backend_tag in list(self._replicas.keys()): - if not any(self._replicas[backend_tag]): - del self._replicas[backend_tag] - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if transition_triggered: + self._start_pending_replicas() + self._stop_pending_replicas() + + num_starting = len(self.currently_starting_replicas) + num_stopping = len(self.currently_stopping_replicas) + + await self._check_currently_starting_replicas() + await self._check_currently_stopping_replicas() + + if 
(len(self.currently_starting_replicas) != num_starting) or \ + (len(self.currently_stopping_replicas) != num_stopping): self._checkpoint() self._notify_replica_handles_changed() diff --git a/python/ray/serve/backend_worker.py b/python/ray/serve/backend_worker.py index 5740cf4f5a6d..da087efa5434 100644 --- a/python/ray/serve/backend_worker.py +++ b/python/ray/serve/backend_worker.py @@ -13,7 +13,7 @@ from ray.async_compat import sync_to_async from ray.serve.utils import (parse_request_item, _get_logger, chain_future, - unpack_future, import_attr) + unpack_future) from ray.serve.exceptions import RayServeException from ray.util import metrics from ray.serve.config import BackendConfig @@ -94,40 +94,33 @@ async def wait_for_batch(self) -> List[Query]: return batch -def create_backend_replica(backend_def: Union[Callable, Type[Callable], str]): +def create_backend_replica(func_or_class: Union[Callable, Type[Callable]]): """Creates a replica class wrapping the provided function or class. This approach is picked over inheritance to avoid conflict between user provided class and the RayServeReplica class. """ - backend_def = backend_def + + if inspect.isfunction(func_or_class): + is_function = True + elif inspect.isclass(func_or_class): + is_function = False + else: + assert False, "func_or_class must be function or class." 
# TODO(architkulkarni): Add type hints after upgrading cloudpickle class RayServeWrappedReplica(object): def __init__(self, backend_tag, replica_tag, init_args, backend_config: BackendConfig, controller_name: str): - if isinstance(backend_def, str): - backend = import_attr(backend_def) - else: - backend = backend_def - - if inspect.isfunction(backend): - is_function = True - elif inspect.isclass(backend): - is_function = False - else: - assert False, ("backend_def must be function, class, or " - "corresponding import path.") - # Set the controller name so that serve.connect() in the user's # backend code will connect to the instance that this backend is # running in. ray.serve.api._set_internal_replica_context( backend_tag, replica_tag, controller_name) if is_function: - _callable = backend + _callable = func_or_class else: - _callable = backend(*init_args) + _callable = func_or_class(*init_args) assert controller_name, "Must provide a valid controller_name" controller_handle = ray.get_actor(controller_name) @@ -151,12 +144,8 @@ def ready(self): async def drain_pending_queries(self): return await self.backend.drain_pending_queries() - if isinstance(backend_def, str): - RayServeWrappedReplica.__name__ = "RayServeReplica_{}".format( - backend_def) - else: - RayServeWrappedReplica.__name__ = "RayServeReplica_{}".format( - backend_def.__name__) + RayServeWrappedReplica.__name__ = "RayServeReplica_{}".format( + func_or_class.__name__) return RayServeWrappedReplica @@ -426,7 +415,8 @@ def reconfigure(self, user_config) -> None: if user_config: if self.is_function: raise ValueError( - "backend_def must be a class to use user_config") + "argument func_or_class must be a class to use user_config" + ) elif not hasattr(self.callable, BACKEND_RECONFIGURE_METHOD): raise RayServeException("user_config specified but backend " + self.backend_tag + " missing " + diff --git a/python/ray/serve/backends.py b/python/ray/serve/backends.py index 5f58ad2c9a8d..086755500a46 100644 --- 
a/python/ray/serve/backends.py +++ b/python/ray/serve/backends.py @@ -1,4 +1,3 @@ -from ray import serve from ray.serve.utils import import_class @@ -27,13 +26,6 @@ def reconfigure(self, *args, **kwargs): # proxy it manually. return self.wrapped.reconfigure(*args, **kwargs) - # We mark 'accept_batch' here just so this will always pass the - # check we make during create_backend(). Unfortunately this means - # that validation won't happen until the replica is created. - @serve.accept_batch - def __call__(self, *args, **kwargs): - return self.wrapped(*args, **kwargs) - def __getattr__(self, attr): """Proxy all other methods to the wrapper class.""" return getattr(self.wrapped, attr) diff --git a/python/ray/serve/benchmarks/cluster.yaml b/python/ray/serve/benchmarks/cluster.yaml index aad50bf97d3e..d588dc06a207 100644 --- a/python/ray/serve/benchmarks/cluster.yaml +++ b/python/ray/serve/benchmarks/cluster.yaml @@ -1,10 +1,13 @@ cluster_name: default min_workers: 5 max_workers: 5 +initial_workers: 5 +autoscaling_mode: default docker: image: 'anyscale/ray-ml:latest' container_name: ray_container pull_before_run: true +target_utilization_fraction: 0.8 idle_timeout_minutes: 5 provider: type: aws diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 8060b406f0de..205af81b065a 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -4,30 +4,23 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator -from ray.serve.constants import DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT +from pydantic import BaseModel, PositiveFloat, PositiveInt, validator +from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, + DEFAULT_HTTP_PORT) -def _callable_accepts_batch(backend_def): - if inspect.isfunction(backend_def): - return hasattr(backend_def, "_serve_accept_batch") - elif inspect.isclass(backend_def): - return hasattr(backend_def.__call__, 
"_serve_accept_batch") - elif isinstance(backend_def, str): - return True - else: - raise TypeError("backend_def must be function, class, or str.") +def _callable_accepts_batch(func_or_class): + if inspect.isfunction(func_or_class): + return hasattr(func_or_class, "_serve_accept_batch") + elif inspect.isclass(func_or_class): + return hasattr(func_or_class.__call__, "_serve_accept_batch") -def _callable_is_blocking(backend_def): - if inspect.isfunction(backend_def): - return not inspect.iscoroutinefunction(backend_def) - elif inspect.isclass(backend_def): - return not inspect.iscoroutinefunction(backend_def.__call__) - elif isinstance(backend_def, str): - return False - else: - raise TypeError("backend_def must be function, class, or str.") +def _callable_is_blocking(func_or_class): + if inspect.isfunction(func_or_class): + return not inspect.iscoroutinefunction(func_or_class) + elif inspect.isclass(func_or_class): + return not inspect.iscoroutinefunction(func_or_class.__call__) @dataclass @@ -71,7 +64,7 @@ class BackendConfig(BaseModel): user_config: Any = None experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0 - experimental_graceful_shutdown_timeout_s: confloat(ge=0) = 20.0 + experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0 class Config: validate_assignment = True @@ -112,11 +105,8 @@ def set_max_queries_by_mode(cls, v, values): # noqa 805 # Pipeline/async mode: if the servable is not blocking, # router should just keep pushing queries to the replicas # until a high limit. - # TODO(edoakes): setting this to a relatively low constant because - # we can't determine if imported backends are sync or async, but we - # may consider tweaking it in the future. if not values["internal_metadata"].is_blocking: - v = 100 + v = ASYNC_CONCURRENCY # Batch inference mode: user specifies non zero timeout to wait for # full batch. 
We will use 2*max_batch_size to perform double @@ -129,11 +119,12 @@ def set_max_queries_by_mode(cls, v, values): # noqa 805 class ReplicaConfig: - def __init__(self, backend_def, *init_args, ray_actor_options=None): - self.backend_def = backend_def - self.accepts_batches = _callable_accepts_batch(backend_def) - self.is_blocking = _callable_is_blocking(backend_def) - self.init_args = list(init_args) + def __init__(self, func_or_class, *actor_init_args, + ray_actor_options=None): + self.func_or_class = func_or_class + self.accepts_batches = _callable_accepts_batch(func_or_class) + self.is_blocking = _callable_is_blocking(func_or_class) + self.actor_init_args = list(actor_init_args) if ray_actor_options is None: self.ray_actor_options = {} else: @@ -143,28 +134,27 @@ def __init__(self, backend_def, *init_args, ray_actor_options=None): self._validate() def _validate(self): - # Validate that backend_def is an import path, function, or class. - if isinstance(self.backend_def, str): - pass - elif inspect.isfunction(self.backend_def): - if len(self.init_args) != 0: + # Validate that func_or_class is a function or class. 
+ if inspect.isfunction(self.func_or_class): + if len(self.actor_init_args) != 0: raise ValueError( - "init_args not supported for function backend.") - elif not inspect.isclass(self.backend_def): + "actor_init_args not supported for function backend.") + elif not inspect.isclass(self.func_or_class): raise TypeError( "Backend must be a function or class, it is {}.".format( - type(self.backend_def))) + type(self.func_or_class))) if not isinstance(self.ray_actor_options, dict): raise TypeError("ray_actor_options must be a dictionary.") elif "lifetime" in self.ray_actor_options: raise ValueError( - "Specifying lifetime in init_args is not allowed.") + "Specifying lifetime in actor_init_args is not allowed.") elif "name" in self.ray_actor_options: - raise ValueError("Specifying name in init_args is not allowed.") + raise ValueError( + "Specifying name in actor_init_args is not allowed.") elif "max_restarts" in self.ray_actor_options: raise ValueError("Specifying max_restarts in " - "init_args is not allowed.") + "actor_init_args is not allowed.") else: # Ray defaults to zero CPUs for placement, we default to one here. 
if "num_cpus" not in self.ray_actor_options: diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index 8996c342dab7..a3c75c711878 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -111,14 +111,14 @@ async def run_control_loop(self) -> None: while True: async with self.write_lock: self.http_state.update() - self.backend_state.update() + await self.backend_state.update() await asyncio.sleep(CONTROL_LOOP_PERIOD_S) def _all_replica_handles( self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: """Used for testing.""" - return self.backend_state.get_running_replica_handles() + return self.backend_state.get_replica_handles() def get_all_backends(self) -> Dict[BackendTag, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" @@ -163,13 +163,10 @@ async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag, self.endpoint_state.shadow_traffic(endpoint_name, backend_tag, proportion) - async def create_endpoint( - self, - endpoint: str, - traffic_dict: Dict[str, float], - route: Optional[str], - methods: List[str], - ) -> None: + # TODO(architkulkarni): add Optional for route after cloudpickle upgrade + async def create_endpoint(self, endpoint: str, + traffic_dict: Dict[str, float], route, + methods: List[str]) -> None: """Create a new endpoint with the specified route and methods. 
If the route is None, this is a "headless" endpoint that will not @@ -238,7 +235,7 @@ async def shutdown(self) -> None: async with self.write_lock: for proxy in self.http_state.get_http_proxy_handles().values(): ray.kill(proxy, no_restart=True) - for replica_dict in self.backend_state.get_running_replica_handles( + for replica_dict in self.backend_state.get_replica_handles( ).values(): for replica in replica_dict.values(): ray.kill(replica, no_restart=True) diff --git a/python/ray/serve/endpoint_state.py b/python/ray/serve/endpoint_state.py index 39a67d090c86..bdbfe2c39351 100644 --- a/python/ray/serve/endpoint_state.py +++ b/python/ray/serve/endpoint_state.py @@ -20,7 +20,7 @@ def __init__(self, kv_store: RayInternalKVStore, long_poll_host: LongPollHost): self._kv_store = kv_store self._long_poll_host = long_poll_host - self._routes: Dict[str, Tuple[EndpointTag, Any]] = dict() + self._routes: Dict[BackendTag, Tuple[EndpointTag, Any]] = dict() self._traffic_policies: Dict[EndpointTag, TrafficPolicy] = dict() checkpoint = self._kv_store.get(CHECKPOINT_KEY) diff --git a/python/ray/serve/examples/doc/imported_backend.py b/python/ray/serve/examples/doc/imported_backend.py index 596604aaa4d9..d80d73b4a72c 100644 --- a/python/ray/serve/examples/doc/imported_backend.py +++ b/python/ray/serve/examples/doc/imported_backend.py @@ -1,12 +1,13 @@ import requests from ray import serve +from ray.serve.backends import ImportedBackend client = serve.start() # Include your class as input to the ImportedBackend constructor. 
-import_path = "ray.serve.utils.MockImportedBackend" -client.create_backend("imported", import_path, "input_arg") +backend_class = ImportedBackend("ray.serve.utils.MockImportedBackend") +client.create_backend("imported", backend_class, "input_arg") client.create_endpoint("imported", backend="imported", route="/imported") print(requests.get("http://127.0.0.1:8000/imported").text) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 3659e5978bf2..c6951c6380b9 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -4,8 +4,7 @@ from typing import Any, Dict, Optional, Union from enum import Enum -from ray.serve.utils import get_random_letters -from ray.util import metrics +from ray.serve.router import Router @dataclass(frozen=True) @@ -41,25 +40,13 @@ class RayServeHandle: # raises RayTaskError Exception """ - def __init__( - self, - router, # ThreadProxiedRouter - endpoint_name, - handle_options: Optional[HandleOptions] = None): + def __init__(self, + router: Router, + endpoint_name, + handle_options: Optional[HandleOptions] = None): self.router = router self.endpoint_name = endpoint_name self.handle_options = handle_options or HandleOptions() - self.handle_tag = f"{self.endpoint_name}#{get_random_letters()}" - - self.request_counter = metrics.Count( - "serve_handle_request_counter", - description=("The number of handle.remote() calls that have been " - "made on this handle."), - tag_keys=("handle", "endpoint")) - self.request_counter.set_default_tags({ - "handle": self.handle_tag, - "endpoint": self.endpoint_name - }) def options(self, *, @@ -91,7 +78,7 @@ def options(self, async def remote(self, request_data: Optional[Union[Dict, Any]] = None, **kwargs): - """Issue an asynchronous request to the endpoint. + """Issue an asynchrounous request to the endpoint. Returns a Ray ObjectRef whose results can be waited for or retrieved using ray.wait or ray.get (or ``await object_ref``), respectively. 
@@ -105,19 +92,12 @@ async def remote(self, ``**kwargs``: All keyword arguments will be available in ``request.query_params``. """ - self.request_counter.record(1) return await self.router._remote( self.endpoint_name, self.handle_options, request_data, kwargs) def __repr__(self): return f"{self.__class__.__name__}(endpoint='{self.endpoint_name}')" - def __reduce__(self): - deserializer = RayServeHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data - class RayServeSyncHandle(RayServeHandle): def remote(self, request_data: Optional[Union[Dict, Any]] = None, @@ -138,15 +118,8 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, ``**kwargs``: All keyword arguments will be available in ``request.args``. """ - self.request_counter.record(1) coro = self.router._remote(self.endpoint_name, self.handle_options, request_data, kwargs) future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( coro, self.router.async_loop) return future.result() - - def __reduce__(self): - deserializer = RayServeSyncHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data diff --git a/python/ray/serve/http_proxy.py b/python/ray/serve/http_proxy.py index 1aad3e9f4a27..5f722276e7ca 100644 --- a/python/ray/serve/http_proxy.py +++ b/python/ray/serve/http_proxy.py @@ -1,86 +1,23 @@ import asyncio import socket -from typing import List, Dict, Tuple +from typing import List import uvicorn import starlette.responses -import starlette.routing import ray from ray.exceptions import RayTaskError -from ray.serve.common import EndpointTag from ray.serve.constants import LongPollKey from ray.util import metrics from ray.serve.utils import _get_logger from ray.serve.http_util import Response, build_starlette_request from ray.serve.long_poll import LongPollAsyncClient +from ray.serve.router import Router from ray.serve.handle import DEFAULT logger = 
_get_logger() -class ServeStarletteEndpoint: - """Wraps the given Serve endpoint in a Starlette endpoint. - - Implements the ASGI protocol. Constructs a Starlette endpoint for use by - a Starlette app or Starlette Router which calls the given Serve endpoint - using the given Serve client. - - Usage: - route = starlette.routing.Route( - "/api", - ServeStarletteEndpoint(self.client, endpoint_tag), - methods=methods) - app = starlette.applications.Starlette(routes=[route]) - """ - - def __init__(self, client, endpoint_tag: EndpointTag): - self.client = client - self.endpoint_tag = endpoint_tag - # This will be lazily populated when the first request comes in. - # TODO(edoakes): we should be able to construct the handle here, but - # that currently breaks pytest. This seems like a bug. - self.handle = None - - async def __call__(self, scope, receive, send): - http_body_bytes = await self.receive_http_body(scope, receive, send) - - headers = {k.decode(): v.decode() for k, v in scope["headers"]} - if self.handle is None: - self.handle = self.client.get_handle(self.endpoint_tag, sync=False) - - object_ref = await self.handle.options( - method_name=headers.get("X-SERVE-CALL-METHOD".lower(), - DEFAULT.VALUE), - shard_key=headers.get("X-SERVE-SHARD-KEY".lower(), DEFAULT.VALUE), - http_method=scope["method"].upper(), - http_headers=headers).remote( - build_starlette_request(scope, http_body_bytes)) - - result = await object_ref - - if isinstance(result, RayTaskError): - error_message = "Task Error. 
Traceback: {}.".format(result) - await Response( - error_message, status_code=500).send(scope, receive, send) - elif isinstance(result, starlette.responses.Response): - await result(scope, receive, send) - else: - await Response(result).send(scope, receive, send) - - async def receive_http_body(self, scope, receive, send): - body_buffer = [] - more_body = True - while more_body: - message = await receive() - assert message["type"] == "http.request" - - more_body = message["more_body"] - body_buffer.append(message["body"]) - - return b"".join(body_buffer) - - class HTTPProxy: """This class is meant to be instantiated and run by an ASGI HTTP server. @@ -96,12 +33,8 @@ def __init__(self, controller_name): self.client = ray.serve.connect() controller = ray.get_actor(controller_name) - - self.router = starlette.routing.Router(default=self._not_found) - - # route -> (endpoint_tag, methods). Updated via long polling. - self.route_table: Dict[str, Tuple[EndpointTag, List[str]]] = {} - + self.route_table = {} # Should be updated via long polling. 
+ self.router = Router(controller) self.long_poll_client = LongPollAsyncClient(controller, { LongPollKey.ROUTE_TABLE: self._update_route_table, }) @@ -111,38 +44,40 @@ def __init__(self, controller_name): description="The number of HTTP requests processed.", tag_keys=("route", )) + async def setup(self): + await self.router.setup_in_async_loop() + async def _update_route_table(self, route_table): logger.debug(f"HTTP Proxy: Get updated route table: {route_table}.") self.route_table = route_table - routes = [ - starlette.routing.Route( - route, - ServeStarletteEndpoint(self.client, endpoint_tag), - methods=methods) - for route, (endpoint_tag, methods) in route_table.items() - if not self._is_headless(route) - ] + async def receive_http_body(self, scope, receive, send): + body_buffer = [] + more_body = True + while more_body: + message = await receive() + assert message["type"] == "http.request" - routes.append( - starlette.routing.Route("/-/routes", self._display_route_table)) + more_body = message["more_body"] + body_buffer.append(message["body"]) - self.router.routes = routes + return b"".join(body_buffer) - async def _not_found(self, scope, receive, send): - current_path = scope["path"] - error_message = ("Path {} not found. " - "Please ping http://.../-/routes for route table." 
- ).format(current_path) - response = Response(error_message, status_code=404) - await response.send(scope, receive, send) + def _make_error_sender(self, scope, receive, send): + async def sender(error_message, status_code): + response = Response(error_message, status_code=status_code) + await response.send(scope, receive, send) - async def _display_route_table(self, request): - return starlette.responses.JSONResponse(self.route_table) + return sender - def _is_headless(self, route: str): - """Returns True if `route` corresponds to a headless endpoint.""" - return not route.startswith("/") + async def _handle_system_request(self, scope, receive, send): + current_path = scope["path"] + if current_path == "/-/routes": + await Response(self.route_table).send(scope, receive, send) + else: + await Response( + "System path {} not found".format(current_path), + status_code=404).send(scope, receive, send) async def __call__(self, scope, receive, send): """Implements the ASGI protocol. @@ -151,6 +86,8 @@ async def __call__(self, scope, receive, send): https://asgi.readthedocs.io/en/latest/specs/index.html. """ + error_sender = self._make_error_sender(scope, receive, send) + assert self.route_table is not None, ( "Route table must be set via set_route_table.") assert scope["type"] == "http" @@ -158,7 +95,51 @@ async def __call__(self, scope, receive, send): self.request_counter.record(1, tags={"route": current_path}) - await self.router(scope, receive, send) + if current_path.startswith("/-/"): + await self._handle_system_request(scope, receive, send) + return + + try: + endpoint_name, methods_allowed = self.route_table[current_path] + except KeyError: + error_message = ( + "Path {} not found. " + "Please ping http://.../-/routes for routing table" + ).format(current_path) + await error_sender(error_message, 404) + return + + if scope["method"] not in methods_allowed: + error_message = ("Methods {} not allowed. 
" + "Available HTTP methods are {}.").format( + scope["method"], methods_allowed) + await error_sender(error_message, 405) + return + + http_body_bytes = await self.receive_http_body(scope, receive, send) + + headers = {k.decode(): v.decode() for k, v in scope["headers"]} + + handle = self.client.get_handle( + endpoint_name, sync=False).options( + method_name=headers.get("X-SERVE-CALL-METHOD".lower(), + DEFAULT.VALUE), + shard_key=headers.get("X-SERVE-SHARD-KEY".lower(), + DEFAULT.VALUE), + http_method=scope["method"].upper(), + http_headers=headers) + + request = build_starlette_request(scope, http_body_bytes) + object_ref = await handle.remote(request) + result = await object_ref + + if isinstance(result, RayTaskError): + error_message = "Task Error. Traceback: {}.".format(result) + await error_sender(error_message, 500) + elif isinstance(result, starlette.responses.Response): + await result(scope, receive, send) + else: + await Response(result).send(scope, receive, send) @ray.remote @@ -176,6 +157,7 @@ async def __init__( self.setup_complete = asyncio.Event() self.app = HTTPProxy(controller_name) + await self.app.setup() self.wrapped_app = self.app for middleware in http_middlewares: diff --git a/python/ray/serve/http_util.py b/python/ray/serve/http_util.py index e8a51adf3d52..0aa4ccf84604 100644 --- a/python/ray/serve/http_util.py +++ b/python/ray/serve/http_util.py @@ -19,16 +19,7 @@ async def mock_receive(): "more_body": False } - # scope["router"] and scope["endpoint"] contain references to a router and - # endpoint object, respectively, which each in turn contain a reference to - # the Serve client, which cannot be serialized. - # The solution is to delete these from scope, as they will not be used. - # Per ASGI recommendation, copy scope before passing to child. 
- child_scope = scope.copy() - del child_scope["router"] - del child_scope["endpoint"] - - return starlette.requests.Request(child_scope, mock_receive) + return starlette.requests.Request(scope, mock_receive) class Response: diff --git a/python/ray/serve/router.py b/python/ray/serve/router.py index ec887d006c43..477f037fd459 100644 --- a/python/ray/serve/router.py +++ b/python/ray/serve/router.py @@ -1,6 +1,7 @@ import asyncio from enum import Enum import itertools +from collections import defaultdict from dataclasses import dataclass, field from typing import Any, ChainMap, Dict, Iterable, List, Optional @@ -48,12 +49,12 @@ class Query: class ReplicaSet: """Data structure representing a set of replica actor handles""" - def __init__(self, backend_tag): - self.backend_tag = backend_tag + def __init__(self): # NOTE(simon): We have to do this because max_concurrent_queries # and the replica handles come from different long poll keys. self.max_concurrent_queries: int = 8 self.in_flight_queries: Dict[ActorHandle, set] = dict() + # The iterator used for load balancing among replicas. Using itertools # cycle, we implements a round-robin policy, skipping overloaded # replicas. @@ -63,25 +64,15 @@ def __init__(self, backend_tag): self.replica_iterator = itertools.cycle(self.in_flight_queries.keys()) # Used to unblock this replica set waiting for free replicas. A newly - # added replica or updated max_concurrent_queries value means the + # added replica or updated max_concurrenty_queries value means the # query that waits on a free replica might be unblocked on. 
self.config_updated_event = asyncio.Event() - self.num_queued_queries = 0 - self.num_queued_queries_gauge = metrics.Gauge( - "serve_backend_queued_queries", - description=( - "The current number of queries to this backend waiting" - " to be assigned to a replica."), - tag_keys=("backend", "endpoint")) - self.num_queued_queries_gauge.set_default_tags({ - "backend": self.backend_tag - }) def set_max_concurrent_queries(self, new_value): if new_value != self.max_concurrent_queries: self.max_concurrent_queries = new_value logger.debug( - f"ReplicaSet: changing max_concurrent_queries to {new_value}") + f"ReplicaSet: chaging max_concurrent_queries to {new_value}") self.config_updated_event.set() def update_worker_replicas(self, worker_replicas: Iterable[ActorHandle]): @@ -101,7 +92,7 @@ def update_worker_replicas(self, worker_replicas: Iterable[ActorHandle]): self.config_updated_event.set() def _try_assign_replica(self, query: Query) -> Optional[ray.ObjectRef]: - """Try to assign query to a replica, return the object ref if succeeded + """Try to assign query to a replica, return the object ref is succeeded or return None if it can't assign this query to any replicas. """ for _ in range(len(self.in_flight_queries.keys())): @@ -139,10 +130,6 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: and only send a query to available replicas (determined by the backend max_concurrent_quries value.) """ - endpoint = query.metadata.endpoint - self.num_queued_queries += 1 - self.num_queued_queries_gauge.record( - self.num_queued_queries, tags={"endpoint": endpoint}) assigned_ref = self._try_assign_replica(query) while assigned_ref is None: # Can't assign a replica right now. 
logger.debug("Failed to assign a replica for " @@ -160,12 +147,8 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: return_when=asyncio.FIRST_COMPLETED) if self.config_updated_event.is_set(): self.config_updated_event.clear() - # We are pretty sure a free replica is ready now, let's recurse and - # assign this query a replica. + # We are pretty sure a free replica is ready now. assigned_ref = self._try_assign_replica(query) - self.num_queued_queries -= 1 - self.num_queued_queries_gauge.record( - self.num_queued_queries, tags={"endpoint": endpoint}) return assigned_ref @@ -185,8 +168,7 @@ def __init__(self, controller_handle: ActorHandle): self.controller = controller_handle self.endpoint_policies: Dict[str, EndpointPolicy] = dict() - - self.backend_replicas: Dict[str, ReplicaSet] = dict() + self.backend_replicas: Dict[str, ReplicaSet] = defaultdict(ReplicaSet) self._pending_endpoints: Dict[str, asyncio.Future] = dict() @@ -230,8 +212,8 @@ async def _update_replica_handles(self, replica_handles): replica_handles) for backend_tag, replica_handles in ChainMap(added, updated).items(): - self._get_or_create_replica_set( - backend_tag).update_worker_replicas(replica_handles) + self.backend_replicas[backend_tag].update_worker_replicas( + replica_handles) for backend_tag in removed.keys(): if backend_tag in self.backend_replicas: @@ -241,9 +223,8 @@ async def _update_backend_configs(self, backend_configs): added, removed, updated = compute_dict_delta(self.backend_replicas, backend_configs) for backend_tag, config in ChainMap(added, updated).items(): - self._get_or_create_replica_set( - backend_tag).set_max_concurrent_queries( - config.max_concurrent_queries) + self.backend_replicas[backend_tag].set_max_concurrent_queries( + config.max_concurrent_queries) for backend_tag in removed.keys(): if backend_tag in self.backend_replicas: @@ -275,22 +256,15 @@ async def assign_request( raise RayServeException( f"Endpoint {endpoint} was removed. 
This request " "cannot be completed.") - logger.info(f"Endpoint {endpoint} registered.") endpoint_policy = self.endpoint_policies[endpoint] chosen_backend, *shadow_backends = endpoint_policy.assign(query) - result_ref = await self._get_or_create_replica_set( - chosen_backend).assign_replica(query) + result_ref = await self.backend_replicas[chosen_backend + ].assign_replica(query) for backend in shadow_backends: - (await self._get_or_create_replica_set(backend) - .assign_replica(query)) + await self.backend_replicas[backend].assign_replica(query) self.num_router_requests.record(1, tags={"endpoint": endpoint}) return result_ref - - def _get_or_create_replica_set(self, backend_name): - if backend_name not in self.backend_replicas: - self.backend_replicas[backend_name] = ReplicaSet(backend_name) - return self.backend_replicas[backend_name] diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index abfdbf1fb25a..202b01386059 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -683,9 +683,6 @@ def f(): client.create_endpoint("endpoint", backend="backend") -# This error is only printed because creation is run in the control loop, not -# in the API path. 
-@pytest.mark.skip() def test_create_infeasible_error(serve_instance): client = serve_instance @@ -875,10 +872,6 @@ def verify_metrics(do_assert=False): # gauge "replica_processing_queries", "replica_queued_queries", - # handle - "serve_handle_request_counter", - # ReplicaSet - "backend_queued_queries" ] for metric in expected_metrics: # For the final error round @@ -989,29 +982,6 @@ async def echo_body(starlette_request): assert resp == long_string -def test_variable_routes(serve_instance): - client = serve_instance - - def f(starlette_request): - return starlette_request.path_params - - client.create_backend("f", f) - client.create_endpoint("basic", backend="f", route="/api/{username}") - - # Test multiple variables and test type conversion - client.create_endpoint( - "complex", backend="f", route="/api/{user_id:int}/{number:float}") - - assert requests.get("http://127.0.0.1:8000/api/scaly").json() == { - "username": "scaly" - } - - assert requests.get("http://127.0.0.1:8000/api/23/12.345").json() == { - "user_id": 23, - "number": 12.345 - } - - if __name__ == "__main__": import sys sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_backend_worker.py b/python/ray/serve/tests/test_backend_worker.py index 11c22e02e976..74c5418df253 100644 --- a/python/ray/serve/tests/test_backend_worker.py +++ b/python/ray/serve/tests/test_backend_worker.py @@ -16,7 +16,7 @@ def setup_worker(name, - backend_def, + func_or_class, init_args=None, backend_config=BackendConfig(), controller_name=""): @@ -26,7 +26,7 @@ def setup_worker(name, @ray.remote class WorkerActor: def __init__(self): - self.worker = create_backend_replica(backend_def)( + self.worker = create_backend_replica(func_or_class)( name, name + ":tag", init_args, backend_config, controller_name) diff --git a/python/ray/serve/tests/test_config.py b/python/ray/serve/tests/test_config.py index 5227b3ff5c53..40942ad767eb 100644 --- a/python/ray/serve/tests/test_config.py +++ 
b/python/ray/serve/tests/test_config.py @@ -3,6 +3,7 @@ from ray import serve from ray.serve.config import (BackendConfig, DeploymentMode, HTTPOptions, ReplicaConfig, BackendMetadata) +from ray.serve.constants import ASYNC_CONCURRENCY from pydantic import ValidationError @@ -41,7 +42,7 @@ def test_backend_config_validation(): assert BackendConfig( max_batch_size=10, internal_metadata=BackendMetadata( - is_blocking=False)).max_concurrent_queries == 100 + is_blocking=False)).max_concurrent_queries == ASYNC_CONCURRENCY assert BackendConfig( max_batch_size=7, batch_wait_timeout=1.0).max_concurrent_queries == 14 diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py index 3cba01ffb3ba..7ecba4d51735 100644 --- a/python/ray/serve/tests/test_failure.py +++ b/python/ray/serve/tests/test_failure.py @@ -1,11 +1,11 @@ import os import requests -import sys +import tempfile import time -import pytest import ray from ray.test_utils import wait_for_condition +from ray import serve from ray.serve.config import BackendConfig, ReplicaConfig @@ -154,34 +154,37 @@ def __call__(self, *args): # Test that if there are multiple replicas for a worker and one dies # unexpectedly, the others continue to serve requests. -@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_worker_replica_failure(serve_instance): client = serve_instance - @ray.remote - class Counter: - def __init__(self): - self.count = 0 - - def inc_and_get(self): - self.count += 1 - return self.count - class Worker: # Assumes that two replicas are started. Will hang forever in the # constructor for any workers that are restarted. 
- def __init__(self, counter): + def __init__(self, path): self.should_hang = False - self.index = ray.get(counter.inc_and_get.remote()) - if self.index > 2: + if not os.path.exists(path): + with open(path, "w") as f: + f.write("1") + else: + with open(path, "r") as f: + num = int(f.read()) + + with open(path, "w") as f: + if num == 2: + self.should_hang = True + else: + f.write(str(num + 1)) + + if self.should_hang: while True: pass def __call__(self, *args): - return self.index + pass - counter = Counter.remote() - client.create_backend("replica_failure", Worker, counter) + temp_path = os.path.join(tempfile.gettempdir(), + serve.utils.get_random_letters()) + client.create_backend("replica_failure", Worker, temp_path) client.update_backend_config( "replica_failure", BackendConfig(num_replicas=2)) client.create_endpoint( @@ -189,16 +192,9 @@ def __call__(self, *args): # Wait until both replicas have been started. responses = set() - start = time.time() - while time.time() - start < 30: + while len(responses) == 1: + responses.add(request_with_retries("/replica_failure", timeout=1).text) time.sleep(0.1) - response = request_with_retries("/replica_failure", timeout=1).text - assert response in ["1", "2"] - responses.add(response) - if len(responses) > 1: - break - else: - raise TimeoutError("Timed out waiting for replicas after 30s.") # Kill one of the replicas. 
handles = _get_worker_handles(client, "replica_failure") @@ -264,4 +260,6 @@ def f(_): if __name__ == "__main__": + import sys + import pytest sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index 88ab9d2c2b7a..c17db7686aad 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -1,51 +1,9 @@ import requests -import pytest + import ray from ray import serve -@pytest.mark.asyncio -async def test_async_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - class TaskActor: - async def task(self, handle): - ref = await handle.remote() - output = await ref - return output - - handle = client.get_handle("f", sync=False) - - task_actor = TaskActor.remote() - result = await task_actor.task.remote(handle) - assert result == "hello" - - -def test_sync_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - def task(handle): - return ray.get(handle.remote()) - - handle = client.get_handle("f", sync=True) - result_ref = task.remote(handle) - assert ray.get(result_ref) == "hello" - - def test_handle_in_endpoint(serve_instance): client = serve_instance diff --git a/python/ray/serve/tests/test_imported_backend.py b/python/ray/serve/tests/test_imported_backend.py index 4b13980725ac..cc575dd94e1d 100644 --- a/python/ray/serve/tests/test_imported_backend.py +++ b/python/ray/serve/tests/test_imported_backend.py @@ -1,16 +1,15 @@ import ray +from ray.serve.backends import ImportedBackend from ray.serve.config import BackendConfig def test_imported_backend(serve_instance): client = serve_instance - config = BackendConfig(user_config="config", max_batch_size=2) + backend_class = 
ImportedBackend("ray.serve.utils.MockImportedBackend") + config = BackendConfig(user_config="config") client.create_backend( - "imported", - "ray.serve.utils.MockImportedBackend", - "input_arg", - config=config) + "imported", backend_class, "input_arg", config=config) client.create_endpoint("imported", backend="imported") # Basic sanity check. @@ -28,12 +27,3 @@ def test_imported_backend(serve_instance): # Check that other call methods work. handle = handle.options(method_name="other_method") assert ray.get(handle.remote("hello")) == "hello" - - # Check that functions work as well. - client.create_backend( - "imported_func", - "ray.serve.utils.mock_imported_function", - config=BackendConfig(max_batch_size=2)) - client.create_endpoint("imported_func", backend="imported_func") - handle = client.get_handle("imported_func") - assert ray.get(handle.remote("hello")) == "hello" diff --git a/python/ray/serve/tests/test_router.py b/python/ray/serve/tests/test_router.py index 9b8eb5548b7c..231ac11a5bfd 100644 --- a/python/ray/serve/tests/test_router.py +++ b/python/ray/serve/tests/test_router.py @@ -204,7 +204,7 @@ async def num_queries(self): return self._num_queries # We will test a scenario with two replicas in the replica set. 
- rs = ReplicaSet("my_backend") + rs = ReplicaSet() workers = [MockWorker.remote() for _ in range(2)] rs.set_max_concurrent_queries(1) rs.update_worker_replicas(workers) diff --git a/python/ray/serve/tests/test_util.py b/python/ray/serve/tests/test_util.py index 95f526c31288..9893bc4cee3e 100644 --- a/python/ray/serve/tests/test_util.py +++ b/python/ray/serve/tests/test_util.py @@ -9,7 +9,7 @@ import ray from ray.serve.utils import (ServeEncoder, chain_future, unpack_future, try_schedule_resources_on_nodes, - get_conda_env_dir, import_attr) + get_conda_env_dir, import_class) def test_bytes_encoder(): @@ -126,11 +126,11 @@ def test_get_conda_env_dir(tmp_path): os.environ["CONDA_PREFIX"] = "" -def test_import_attr(): - assert import_attr("ray.serve.Client") == ray.serve.api.Client - assert import_attr("ray.serve.api.Client") == ray.serve.api.Client +def test_import_class(): + assert import_class("ray.serve.Client") == ray.serve.api.Client + assert import_class("ray.serve.api.Client") == ray.serve.api.Client - policy_cls = import_attr("ray.serve.controller.TrafficPolicy") + policy_cls = import_class("ray.serve.controller.TrafficPolicy") assert policy_cls == ray.serve.controller.TrafficPolicy policy = policy_cls({"endpoint1": 0.5, "endpoint2": 0.5}) @@ -140,10 +140,6 @@ def test_import_attr(): print(repr(policy)) - # Very meta... - import_attr_2 = import_attr("ray.serve.utils.import_attr") - assert import_attr_2 == import_attr - if __name__ == "__main__": import sys diff --git a/python/ray/serve/utils.py b/python/ray/serve/utils.py index 1d19593e63b1..a594b94ddb90 100644 --- a/python/ray/serve/utils.py +++ b/python/ray/serve/utils.py @@ -359,26 +359,22 @@ def get_node_id_for_actor(actor_handle): return ray.actors()[actor_handle._actor_id.hex()]["Address"]["NodeID"] -def import_attr(full_path: str): - """Given a full import path to a module attr, return the imported attr. 
+def import_class(full_path: str): + """Given a full import path to a class name, return the imported class. For example, the following are equivalent: - MyClass = import_attr("module.submodule.MyClass") + MyClass = import_class("module.submodule.MyClass") from module.submodule import MyClass Returns: - Imported attr + Imported class """ last_period_idx = full_path.rfind(".") - attr_name = full_path[last_period_idx + 1:] + class_name = full_path[last_period_idx + 1:] module_name = full_path[:last_period_idx] module = importlib.import_module(module_name) - return getattr(module, attr_name) - - -async def mock_imported_function(batch): - return [await request.body() for request in batch] + return getattr(module, class_name) class MockImportedBackend: @@ -396,17 +392,11 @@ def __init__(self, arg): def reconfigure(self, config): self.config = config - def __call__(self, batch): - return [{ - "arg": self.arg, - "config": self.config - } for _ in range(len(batch))] + def __call__(self, *args): + return {"arg": self.arg, "config": self.config} - async def other_method(self, batch): - responses = [] - for request in batch: - responses.append(await request.body()) - return responses + async def other_method(self, request): + return await request.body() def compute_iterable_delta(old: Iterable, @@ -416,7 +406,7 @@ def compute_iterable_delta(old: Iterable, Usage: >>> old = {"a", "b"} >>> new = {"a", "d"} - >>> compute_iterable_delta(old, new) + >>> compute_dict_delta(old, new) ({"d"}, {"b"}, {"a"}) """ old_keys, new_keys = set(old), set(new) diff --git a/python/ray/setup-dev.py b/python/ray/setup-dev.py index dcbb622ad16d..285c0028e159 100755 --- a/python/ray/setup-dev.py +++ b/python/ray/setup-dev.py @@ -66,7 +66,7 @@ def do_link(package, force=False, local_path=None): do_link("rllib", force=args.yes, local_path="../../../rllib") do_link("tune", force=args.yes) do_link("autoscaler", force=args.yes) - do_link("ray_operator", force=args.yes) + do_link("operator", 
force=args.yes) do_link("cloudpickle", force=args.yes) do_link("scripts", force=args.yes) do_link("internal", force=args.yes) diff --git a/python/ray/state.py b/python/ray/state.py index 7524ea1244b2..aa3488e20e78 100644 --- a/python/ray/state.py +++ b/python/ray/state.py @@ -388,20 +388,6 @@ def profile_table(self): return dict(result) - def get_placement_group_by_name(self, placement_group_name): - self._check_connected() - - placement_group_info = ( - self.global_state_accessor.get_placement_group_by_name( - placement_group_name)) - if placement_group_info is None: - return None - else: - placement_group_table_data = \ - gcs_utils.PlacementGroupTableData.FromString( - placement_group_info) - return self._gen_placement_group_info(placement_group_table_data) - def placement_group_table(self, placement_group_id=None): self._check_connected() diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 2572c50c2dcf..0f2709c82fc0 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -23,11 +23,8 @@ py_test_module_list( "test_autoscaling_policy.py", "test_basic.py", "test_basic_2.py", - "test_basic_3.py", "test_cancel.py", "test_cli.py", - "test_client.py", - "test_client_init.py", "test_component_failures_2.py", "test_component_failures_3.py", "test_error_ray_not_initialized.py", @@ -51,12 +48,12 @@ py_test_module_list( "test_metrics.py", "test_multi_node.py", "test_multi_node_2.py", - "test_multi_node_3.py", "test_multi_tenancy.py", "test_multinode_failures.py", "test_multinode_failures_2.py", "test_multiprocessing.py", "test_object_manager.py", + "test_object_spilling.py", "test_output.py", "test_reconstruction.py", "test_reference_counting.py", @@ -83,6 +80,7 @@ py_test_module_list( "test_autoscaler.py", "test_autoscaler_yaml.py", "test_client_metadata.py", + "test_client.py", "test_client_references.py", "test_client_terminate.py", "test_command_runner.py", @@ -92,7 +90,6 @@ py_test_module_list( "test_dask_scheduler.py", "test_debug_tools.py", 
"test_job.py", - "test_k8s_operator_mock.py", "test_memstat.py", "test_metrics_agent.py", "test_microbenchmarks.py", @@ -103,7 +100,6 @@ py_test_module_list( "test_queue.py", "test_ray_debugger.py", "test_ray_init.py", - "test_shuffle.py", "test_tempfile.py", ], size = "small", @@ -115,9 +111,8 @@ py_test_module_list( py_test_module_list( files = [ "test_k8s_cluster_launcher.py", - "test_k8s_operator_examples.py", ], - size = "medium", + size = "small", extra_srcs = SRCS, deps = ["//:ray_lib"], tags = ["kubernetes"] @@ -137,7 +132,6 @@ py_test_module_list( py_test_module_list( files = [ "test_placement_group.py", - "test_object_spilling.py", ], size = "large", extra_srcs = SRCS, @@ -177,12 +171,11 @@ py_test_module_list( "test_advanced.py", "test_basic.py", "test_basic_2.py", - "test_basic_3.py", ], size = "medium", extra_srcs = SRCS, name_suffix = "_client_mode", - # TODO(barakmich): py_test will support env in Bazel 4.0.0... + # TODO(barakmich): py_test will support env in Bazel 4.0.0... # Until then, we can use tags. 
#env = {"RAY_CLIENT_MODE": "1"}, tags = ["exclusive", "client_tests"], diff --git a/python/ray/tests/aws/test_autoscaler_aws.py b/python/ray/tests/aws/test_autoscaler_aws.py index acf6c2d628c2..697c9efb163c 100644 --- a/python/ray/tests/aws/test_autoscaler_aws.py +++ b/python/ray/tests/aws/test_autoscaler_aws.py @@ -1,8 +1,6 @@ import pytest -from ray.autoscaler._private.aws.config import _get_vpc_id_or_die, \ - bootstrap_aws, \ - DEFAULT_AMI +from ray.autoscaler._private.aws.config import _get_vpc_id_or_die import ray.tests.aws.utils.stubs as stubs import ray.tests.aws.utils.helpers as helpers from ray.tests.aws.utils.constants import AUX_SUBNET, DEFAULT_SUBNET, \ @@ -115,53 +113,6 @@ def test_create_sg_with_custom_inbound_rules_and_name(iam_client_stub, ec2_client_stub.assert_no_pending_responses() -def test_subnet_given_head_and_worker_sg(iam_client_stub, ec2_client_stub): - stubs.configure_iam_role_default(iam_client_stub) - stubs.configure_key_pair_default(ec2_client_stub) - - # list a security group and a thousand subnets in different vpcs - stubs.describe_a_security_group(ec2_client_stub, DEFAULT_SG) - stubs.describe_a_thousand_subnets_in_different_vpcs(ec2_client_stub) - - config = helpers.bootstrap_aws_example_config_file( - "example-head-and-worker-security-group.yaml") - - # check that just the single subnet in the right vpc is filled - assert config["head_node"]["SubnetIds"] == [DEFAULT_SUBNET["SubnetId"]] - assert config["worker_nodes"]["SubnetIds"] == [DEFAULT_SUBNET["SubnetId"]] - - # expect no pending responses left in IAM or EC2 client stub queues - iam_client_stub.assert_no_pending_responses() - ec2_client_stub.assert_no_pending_responses() - - -def test_fills_out_amis(iam_client_stub, ec2_client_stub): - # Setup stubs to mock out boto3 - stubs.configure_iam_role_default(iam_client_stub) - stubs.configure_key_pair_default(ec2_client_stub) - stubs.describe_a_security_group(ec2_client_stub, DEFAULT_SG) - 
stubs.configure_subnet_default(ec2_client_stub) - - config = helpers.load_aws_example_config_file("example-full.yaml") - del config["head_node"]["ImageId"] - del config["worker_nodes"]["ImageId"] - - # Pass in SG for stub to work - config["head_node"]["SecurityGroupIds"] = ["sg-1234abcd"] - config["worker_nodes"]["SecurityGroupIds"] = ["sg-1234abcd"] - - defaults_filled = bootstrap_aws(config) - - ami = DEFAULT_AMI.get(config.get("provider", {}).get("region")) - - assert defaults_filled["head_node"].get("ImageId") == ami - - assert defaults_filled["worker_nodes"].get("ImageId") == ami - - iam_client_stub.assert_no_pending_responses() - ec2_client_stub.assert_no_pending_responses() - - if __name__ == "__main__": import sys sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/aws/utils/constants.py b/python/ray/tests/aws/utils/constants.py index adc8a5b2abe4..cdcf5a79c68d 100644 --- a/python/ray/tests/aws/utils/constants.py +++ b/python/ray/tests/aws/utils/constants.py @@ -50,19 +50,6 @@ "VpcId": "vpc-0000000", } - -def subnet_in_vpc(vpc_num): - """Returns a copy of DEFAULT_SUBNET whose VpcId ends with the digits - of vpc_num.""" - subnet = copy.copy(DEFAULT_SUBNET) - subnet["VpcId"] = f"vpc-{vpc_num:07d}" - return subnet - - -A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS = [ - subnet_in_vpc(vpc_num) for vpc_num in range(1, 1000) -] + [DEFAULT_SUBNET] - # Secondary EC2 subnet to expose to tests as required. 
AUX_SUBNET = { "AvailabilityZone": "us-west-2a", diff --git a/python/ray/tests/aws/utils/stubs.py b/python/ray/tests/aws/utils/stubs.py index 61f1f9ab632b..7840447d80e0 100644 --- a/python/ray/tests/aws/utils/stubs.py +++ b/python/ray/tests/aws/utils/stubs.py @@ -1,7 +1,7 @@ import ray from ray.tests.aws.utils.mocks import mock_path_exists_key_pair from ray.tests.aws.utils.constants import DEFAULT_INSTANCE_PROFILE, \ - DEFAULT_KEY_PAIR, DEFAULT_SUBNET, A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS + DEFAULT_KEY_PAIR, DEFAULT_SUBNET from unittest import mock @@ -41,13 +41,6 @@ def configure_subnet_default(ec2_client_stub): service_response={"Subnets": [DEFAULT_SUBNET]}) -def describe_a_thousand_subnets_in_different_vpcs(ec2_client_stub): - ec2_client_stub.add_response( - "describe_subnets", - expected_params={}, - service_response={"Subnets": A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS}) - - def skip_to_configure_sg(ec2_client_stub, iam_client_stub): configure_iam_role_default(iam_client_stub) configure_key_pair_default(ec2_client_stub) @@ -73,18 +66,6 @@ def describe_no_security_groups(ec2_client_stub): service_response={}) -def describe_a_security_group(ec2_client_stub, security_group): - ec2_client_stub.add_response( - "describe_security_groups", - expected_params={ - "Filters": [{ - "Name": "group-id", - "Values": [security_group["GroupId"]] - }] - }, - service_response={"SecurityGroups": [security_group]}) - - def create_sg_echo(ec2_client_stub, security_group): ec2_client_stub.add_response( "create_security_group", diff --git a/python/ray/tests/test_actor_advanced.py b/python/ray/tests/test_actor_advanced.py index 496e977fe9cd..1913decf83df 100644 --- a/python/ray/tests/test_actor_advanced.py +++ b/python/ray/tests/test_actor_advanced.py @@ -1093,90 +1093,6 @@ class Actor2: global_state_accessor.disconnect() -def test_kill_pending_actor_with_no_restart_true(): - cluster = ray.init() - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], 
ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - @ray.remote(resources={"WORKER": 1.0}) - class PendingActor: - pass - - # Kill actor with `no_restart=True`. - actor = PendingActor.remote() - # TODO(ffbin): The raylet doesn't guarantee the order when dealing with - # RequestWorkerLease and CancelWorkerLease. If we kill the actor - # immediately after creating the actor, we may not be able to clean up - # the request cached by the raylet. - # See https://github.com/ray-project/ray/issues/13545 for details. - time.sleep(1) - ray.kill(actor, no_restart=True) - - def condition1(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return True - return False - - # Actor is dead, so the infeasible task queue length is 0. - wait_for_condition(condition1, timeout=10) - - global_state_accessor.disconnect() - ray.shutdown() - - -def test_kill_pending_actor_with_no_restart_false(): - cluster = ray.init() - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - @ray.remote(resources={"WORKER": 1.0}, max_restarts=1) - class PendingActor: - pass - - # Kill actor with `no_restart=False`. - actor = PendingActor.remote() - # TODO(ffbin): The raylet doesn't guarantee the order when dealing with - # RequestWorkerLease and CancelWorkerLease. If we kill the actor - # immediately after creating the actor, we may not be able to clean up - # the request cached by the raylet. - # See https://github.com/ray-project/ray/issues/13545 for details. 
- time.sleep(1) - ray.kill(actor, no_restart=False) - - def condition1(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return False - return True - - # Actor restarts, so the infeasible task queue length is 1. - wait_for_condition(condition1, timeout=10) - - # Kill actor again and actor is dead, - # so the infeasible task queue length is 0. - ray.kill(actor, no_restart=False) - - def condition2(): - message = global_state_accessor.get_all_resource_usage() - resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - if len(resource_usages.resource_load_by_shape.resource_demands) == 0: - return True - return False - - wait_for_condition(condition2, timeout=10) - - global_state_accessor.disconnect() - ray.shutdown() - - if __name__ == "__main__": import pytest # Test suite is timing out. Disable on windows for now. diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 677b0e0fc940..227fb48d211d 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -32,9 +32,10 @@ def ray_init_with_task_retry_delay(): @pytest.mark.parametrize( "ray_start_regular", [{ "object_store_memory": 150 * 1024 * 1024, + "_lru_evict": True, }], indirect=True) -def test_actor_spilled(ray_start_regular): +def test_actor_eviction(ray_start_regular): object_store_memory = 150 * 1024 * 1024 @ray.remote @@ -57,17 +58,21 @@ def create_object(self, size): ray.get(obj) # Get each object again. At this point, the earlier objects should have - # been spilled. - num_success = 0 + # been evicted. + num_evicted, num_success = 0, 0 for obj in objects: - val = ray.get(obj) - assert isinstance(val, np.ndarray), val - num_success += 1 - # All of objects should've been spilled, so all of them should succeed. 
- assert num_success == len(objects) + try: + val = ray.get(obj) + assert isinstance(val, np.ndarray), val + num_success += 1 + except ray.exceptions.ObjectLostError: + num_evicted += 1 + # Some objects should have been evicted, and some should still be in the + # object store. + assert num_evicted > 0 + assert num_success > 0 -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") def test_actor_restart(ray_init_with_task_retry_delay): """Test actor restart when actor process is killed.""" @@ -110,8 +115,6 @@ def get_pid(self): ray.get(results[0]) except ray.exceptions.RayActorError: results.pop(0) - else: - break # Check all tasks that executed after the restart. if results: # The actor executed some tasks after the restart. @@ -272,7 +275,7 @@ def call_other(self, counter, signal): def test_actor_restart_on_node_failure(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 1000, "task_retry_delay_ms": 100, } @@ -428,7 +431,6 @@ def increase(self): assert ray.get(RetryableTask.remote(remote_actor)) == 3 -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") # NOTE(hchen): we set object_timeout_milliseconds to 1s for # this test. Because if this value is too small, suprious task reconstruction # may happen and cause the test fauilure. 
If the value is too large, this test diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py index 5a2b57e2c23d..2e60f40e997c 100644 --- a/python/ray/tests/test_advanced_3.py +++ b/python/ray/tests/test_advanced_3.py @@ -21,8 +21,9 @@ import setproctitle import subprocess -from ray.test_utils import (check_call_ray, wait_for_condition, - wait_for_num_actors, new_scheduler_enabled) +from ray.test_utils import (check_call_ray, RayTestTimeoutException, + wait_for_condition, wait_for_num_actors, + new_scheduler_enabled) logger = logging.getLogger(__name__) @@ -155,6 +156,15 @@ def f(x): assert ray.get(f.remote(non_local.remote())) == non_local_node.unique_id +def wait_for_num_objects(num_objects, timeout=10): + start_time = time.time() + while time.time() - start_time < timeout: + if len(ray.objects()) >= num_objects: + return + time.sleep(0.1) + raise RayTestTimeoutException("Timed out while waiting for global state.") + + def test_global_state_api(shutdown_only): ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1}) @@ -344,7 +354,10 @@ def test_initialized_local_mode(shutdown_only_with_initialization_check): def test_wait_reconstruction(shutdown_only): - ray.init(num_cpus=1, object_store_memory=int(10**8)) + ray.init( + num_cpus=1, + object_store_memory=int(10**8), + _system_config={"object_pinning_enabled": 0}) @ray.remote def f(): @@ -611,14 +624,7 @@ def f(self): def test_lease_request_leak(shutdown_only): - ray.init( - num_cpus=1, - _system_config={ - # This test uses ray.objects(), which only works with the GCS-based - # object directory - "ownership_based_object_directory_enabled": False, - "object_timeout_milliseconds": 200 - }) + ray.init(num_cpus=1, _system_config={"object_timeout_milliseconds": 200}) assert len(ray.objects()) == 0 @ray.remote diff --git a/python/ray/tests/test_asyncio.py b/python/ray/tests/test_asyncio.py index fd99343254d5..18dd63a22d07 100644 --- a/python/ray/tests/test_asyncio.py +++ 
b/python/ray/tests/test_asyncio.py @@ -6,7 +6,7 @@ import pytest import ray -from ray.test_utils import SignalActor, wait_for_condition +from ray.test_utils import SignalActor def test_asyncio_actor(ray_start_regular_shared): @@ -224,37 +224,6 @@ async def loop_forever(self): ray.get(a.ping.remote()) -def test_async_callback(ray_start_regular_shared): - global_set = set() - - ref = ray.put(None) - ref._on_completed(lambda _: global_set.add("completed-1")) - wait_for_condition(lambda: "completed-1" in global_set) - - signal = SignalActor.remote() - - @ray.remote - def wait(): - ray.get(signal.wait.remote()) - - ref = wait.remote() - ref._on_completed(lambda _: global_set.add("completed-2")) - assert "completed-2" not in global_set - signal.send.remote() - wait_for_condition(lambda: "completed-2" in global_set) - - -def test_async_function_errored(ray_start_regular_shared): - @ray.remote - async def f(): - pass - - ref = f.remote() - - with pytest.raises(ValueError): - ray.get(ref) - - if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index 925cb1d202d8..f0f16318ac37 100644 --- a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -429,53 +429,6 @@ def testGetOrCreateHeadNode(self): f"docker cp {docker_mount_prefix}/~/ray_bootstrap_config.yaml" runner.assert_has_call("1.2.3.4", pattern=pattern_to_assert) - @unittest.skipIf(sys.platform == "win32", "Failing on Windows.") - def testGetOrCreateHeadNodePodman(self): - config = copy.deepcopy(SMALL_CLUSTER) - config["docker"]["use_podman"] = True - config_path = self.write_config(config) - self.provider = MockProvider() - runner = MockProcessRunner() - runner.respond_to_call("json .Mounts", ["[]"]) - # Two initial calls to docker cp, + 2 more calls during run_init - runner.respond_to_call(".State.Running", - ["false", "false", "false", "false"]) - runner.respond_to_call("json 
.Config.Env", ["[]"]) - commands.get_or_create_head_node( - config, - printable_config_file=config_path, - no_restart=False, - restart_only=False, - yes=True, - override_cluster_name=None, - _provider=self.provider, - _runner=runner) - self.waitForNodes(1) - runner.assert_has_call("1.2.3.4", "init_cmd") - runner.assert_has_call("1.2.3.4", "head_setup_cmd") - runner.assert_has_call("1.2.3.4", "start_ray_head") - self.assertEqual(self.provider.mock_nodes[0].node_type, None) - runner.assert_has_call("1.2.3.4", pattern="podman run") - - docker_mount_prefix = get_docker_host_mount_location( - SMALL_CLUSTER["cluster_name"]) - runner.assert_not_has_call( - "1.2.3.4", - pattern=f"-v {docker_mount_prefix}/~/ray_bootstrap_config") - runner.assert_has_call( - "1.2.3.4", - pattern=f"podman cp {docker_mount_prefix}/~/ray_bootstrap_key.pem") - pattern_to_assert = \ - f"podman cp {docker_mount_prefix}/~/ray_bootstrap_config.yaml" - runner.assert_has_call("1.2.3.4", pattern=pattern_to_assert) - - for cmd in runner.command_history(): - assert "docker" not in cmd, ("Docker (not podman) found in call: " - f"{cmd}") - - runner.assert_has_call("1.2.3.4", "podman inspect") - runner.assert_has_call("1.2.3.4", "podman exec") - @unittest.skipIf(sys.platform == "win32", "Failing on Windows.") def testGetOrCreateHeadNodeFromStopped(self): self.testGetOrCreateHeadNode() @@ -500,7 +453,7 @@ def testGetOrCreateHeadNodeFromStopped(self): _provider=self.provider, _runner=runner) self.waitForNodes(1) - # Init & Setup commands must be run for Docker! + # Init & Setup commands msut be run for Docker! 
runner.assert_has_call("1.2.3.4", "init_cmd") runner.assert_has_call("1.2.3.4", "head_setup_cmd") runner.assert_has_call("1.2.3.4", "start_ray_head") @@ -543,34 +496,6 @@ def testGetOrCreateHeadNodeFromStopped(self): assert first_mkdir < first_rsync assert first_rsync < first_cp - def testGetOrCreateHeadNodeFromStoppedRestartOnly(self): - self.testGetOrCreateHeadNode() - self.provider.cache_stopped = True - existing_nodes = self.provider.non_terminated_nodes({}) - assert len(existing_nodes) == 1 - self.provider.terminate_node(existing_nodes[0]) - config_path = self.write_config(SMALL_CLUSTER) - runner = MockProcessRunner() - runner.respond_to_call("json .Mounts", ["[]"]) - # Two initial calls to docker cp, + 2 more calls during run_init - runner.respond_to_call(".State.Running", - ["false", "false", "false", "false"]) - runner.respond_to_call("json .Config.Env", ["[]"]) - commands.get_or_create_head_node( - SMALL_CLUSTER, - printable_config_file=config_path, - no_restart=False, - restart_only=True, - yes=True, - override_cluster_name=None, - _provider=self.provider, - _runner=runner) - self.waitForNodes(1) - # Init & Setup commands must be run for Docker! 
- runner.assert_has_call("1.2.3.4", "init_cmd") - runner.assert_has_call("1.2.3.4", "head_setup_cmd") - runner.assert_has_call("1.2.3.4", "start_ray_head") - @unittest.skipIf(sys.platform == "win32", "Failing on Windows.") def testDockerFileMountsAdded(self): config = copy.deepcopy(SMALL_CLUSTER) diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py index 5595382a02ea..b712c8955e97 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -11,8 +11,6 @@ from ray.autoscaler._private.util import prepare_config, validate_config from ray.autoscaler._private.providers import _NODE_PROVIDERS -from ray.autoscaler._private.kubernetes.node_provider import\ - KubernetesNodeProvider from ray.test_utils import recursive_fnmatch @@ -27,7 +25,6 @@ def ignore_k8s_operator_configs(paths): return [ path for path in paths if "kubernetes/operator_configs" not in path - and "kubernetes/job-example.yaml" not in path ] @@ -43,44 +40,14 @@ def testValidateDefaultConfig(self): with open(config_path) as f: config = yaml.safe_load(f) config = prepare_config(config) - if config["provider"]["type"] == "kubernetes": - KubernetesNodeProvider.fillout_available_node_types_resources( - config) try: validate_config(config) except Exception: - self.fail( - f"Config {config_path} did not pass validation test!") - - @pytest.mark.skipif( - sys.platform.startswith("win"), reason="Fails on Windows.") - def testValidateDefaultConfigMinMaxWorkers(self): - aws_config_path = os.path.join( - RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml") - with open(aws_config_path) as f: - config = yaml.safe_load(f) - config = prepare_config(config) - for node_type in config["available_node_types"]: - config["available_node_types"][node_type]["resources"] = config[ - "available_node_types"][node_type].get("resources", {}) - try: - validate_config(config) - except Exception: - self.fail("Config did not pass validation test!") - 
- config["max_workers"] = 0 # the sum of min_workers is 1. - with pytest.raises(ValueError): - validate_config(config) - - # make sure edge case of exactly 1 passes too. - config["max_workers"] = 1 - try: - validate_config(config) - except Exception: - self.fail("Config did not pass validation test!") + self.fail("Config did not pass validation test!") @pytest.mark.skipif( - sys.platform.startswith("win"), reason="Fails on Windows.") + sys.platform.startswith("win"), + reason="TODO(ameer): fails on Windows.") def testValidateDefaultConfigAWSMultiNodeTypes(self): aws_config_path = os.path.join( RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml") diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 4c80aea70ebb..4475bb6ea464 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -9,7 +9,11 @@ import pytest import ray.cluster_utils -from ray.test_utils import (client_test_enabled) +from ray.test_utils import ( + client_test_enabled, + dicts_equal, + wait_for_pid_to_exit, +) import ray @@ -166,6 +170,126 @@ class A2: x = 1 +def test_many_fractional_resources(shutdown_only): + ray.init(num_cpus=2, num_gpus=2, resources={"Custom": 2}) + + @ray.remote + def g(): + return 1 + + @ray.remote + def f(block, accepted_resources): + true_resources = { + resource: value[0][1] + for resource, value in ray.get_resource_ids().items() + } + if block: + ray.get(g.remote()) + return dicts_equal(true_resources, accepted_resources) + + # Check that the resource are assigned correctly. 
+ result_ids = [] + for rand1, rand2, rand3 in np.random.uniform(size=(100, 3)): + resource_set = {"CPU": int(rand1 * 10000) / 10000} + result_ids.append(f._remote([False, resource_set], num_cpus=rand1)) + + resource_set = {"CPU": 1, "GPU": int(rand1 * 10000) / 10000} + result_ids.append(f._remote([False, resource_set], num_gpus=rand1)) + + resource_set = {"CPU": 1, "Custom": int(rand1 * 10000) / 10000} + result_ids.append( + f._remote([False, resource_set], resources={"Custom": rand1})) + + resource_set = { + "CPU": int(rand1 * 10000) / 10000, + "GPU": int(rand2 * 10000) / 10000, + "Custom": int(rand3 * 10000) / 10000 + } + result_ids.append( + f._remote( + [False, resource_set], + num_cpus=rand1, + num_gpus=rand2, + resources={"Custom": rand3})) + result_ids.append( + f._remote( + [True, resource_set], + num_cpus=rand1, + num_gpus=rand2, + resources={"Custom": rand3})) + assert all(ray.get(result_ids)) + + # Check that the available resources at the end are the same as the + # beginning. + stop_time = time.time() + 10 + correct_available_resources = False + while time.time() < stop_time: + available_resources = ray.available_resources() + if ("CPU" in available_resources + and ray.available_resources()["CPU"] == 2.0 + and "GPU" in available_resources + and ray.available_resources()["GPU"] == 2.0 + and "Custom" in available_resources + and ray.available_resources()["Custom"] == 2.0): + correct_available_resources = True + break + if not correct_available_resources: + assert False, "Did not get correct available resources." + + +def test_background_tasks_with_max_calls(shutdown_only): + ray.init(num_cpus=2) + + @ray.remote + def g(): + time.sleep(.1) + return 0 + + @ray.remote(max_calls=1, max_retries=0) + def f(): + return [g.remote()] + + nested = ray.get([f.remote() for _ in range(10)]) + + # Should still be able to retrieve these objects, since f's workers will + # wait for g to finish before exiting. 
+ ray.get([x[0] for x in nested]) + + @ray.remote(max_calls=1, max_retries=0) + def f(): + return os.getpid(), g.remote() + + nested = ray.get([f.remote() for _ in range(10)]) + while nested: + pid, g_id = nested.pop(0) + ray.get(g_id) + del g_id + wait_for_pid_to_exit(pid) + + +@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") +def test_fair_queueing(shutdown_only): + ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1}) + + @ray.remote + def h(): + return 0 + + @ray.remote + def g(): + return ray.get(h.remote()) + + @ray.remote + def f(): + return ray.get(g.remote()) + + # This will never finish without fair queueing of {f, g, h}: + # https://github.com/ray-project/ray/issues/3644 + ready, _ = ray.wait( + [f.remote() for _ in range(1000)], timeout=60.0, num_returns=1000) + assert len(ready) == 1000, len(ready) + + def test_put_get(shutdown_only): ray.init(num_cpus=0) @@ -261,9 +385,6 @@ def foo(): "ray_start_cluster_head", [{ "num_cpus": 0, "object_store_memory": 75 * 1024 * 1024, - "_system_config": { - "automatic_object_spilling_enabled": False - } }], indirect=True) def test_fetch_local(ray_start_cluster_head): diff --git a/python/ray/tests/test_basic_2.py b/python/ray/tests/test_basic_2.py index 21fabc4ba55a..b71c63fbf941 100644 --- a/python/ray/tests/test_basic_2.py +++ b/python/ray/tests/test_basic_2.py @@ -342,7 +342,7 @@ def g(x): @pytest.mark.skipif(client_test_enabled(), reason="message size") def test_system_config_when_connecting(ray_start_cluster): - config = {"object_timeout_milliseconds": 200} + config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200} cluster = ray.cluster_utils.Cluster() cluster.add_node( _system_config=config, object_store_memory=100 * 1024 * 1024) @@ -360,7 +360,9 @@ def test_system_config_when_connecting(ray_start_cluster): put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8)) del put_ref - ray.get(obj_ref) + # This would not raise an exception if object pinning was 
enabled. + with pytest.raises(ray.exceptions.ObjectLostError): + ray.get(obj_ref) def test_get_multiple(ray_start_regular_shared): diff --git a/python/ray/tests/test_basic_3.py b/python/ray/tests/test_basic_3.py deleted file mode 100644 index 3b4b7ac9493a..000000000000 --- a/python/ray/tests/test_basic_3.py +++ /dev/null @@ -1,142 +0,0 @@ -# coding: utf-8 -import logging -import os -import sys -import time - -import numpy as np -import pytest - -import ray.cluster_utils -from ray.test_utils import ( - dicts_equal, - wait_for_pid_to_exit, -) - -import ray - -logger = logging.getLogger(__name__) - - -def test_many_fractional_resources(shutdown_only): - ray.init(num_cpus=2, num_gpus=2, resources={"Custom": 2}) - - @ray.remote - def g(): - return 1 - - @ray.remote - def f(block, accepted_resources): - true_resources = { - resource: value[0][1] - for resource, value in ray.get_resource_ids().items() - } - if block: - ray.get(g.remote()) - return dicts_equal(true_resources, accepted_resources) - - # Check that the resource are assigned correctly. 
- result_ids = [] - for rand1, rand2, rand3 in np.random.uniform(size=(100, 3)): - resource_set = {"CPU": int(rand1 * 10000) / 10000} - result_ids.append(f._remote([False, resource_set], num_cpus=rand1)) - - resource_set = {"CPU": 1, "GPU": int(rand1 * 10000) / 10000} - result_ids.append(f._remote([False, resource_set], num_gpus=rand1)) - - resource_set = {"CPU": 1, "Custom": int(rand1 * 10000) / 10000} - result_ids.append( - f._remote([False, resource_set], resources={"Custom": rand1})) - - resource_set = { - "CPU": int(rand1 * 10000) / 10000, - "GPU": int(rand2 * 10000) / 10000, - "Custom": int(rand3 * 10000) / 10000 - } - result_ids.append( - f._remote( - [False, resource_set], - num_cpus=rand1, - num_gpus=rand2, - resources={"Custom": rand3})) - result_ids.append( - f._remote( - [True, resource_set], - num_cpus=rand1, - num_gpus=rand2, - resources={"Custom": rand3})) - assert all(ray.get(result_ids)) - - # Check that the available resources at the end are the same as the - # beginning. - stop_time = time.time() + 10 - correct_available_resources = False - while time.time() < stop_time: - available_resources = ray.available_resources() - if ("CPU" in available_resources - and ray.available_resources()["CPU"] == 2.0 - and "GPU" in available_resources - and ray.available_resources()["GPU"] == 2.0 - and "Custom" in available_resources - and ray.available_resources()["Custom"] == 2.0): - correct_available_resources = True - break - if not correct_available_resources: - assert False, "Did not get correct available resources." - - -def test_background_tasks_with_max_calls(shutdown_only): - ray.init(num_cpus=2) - - @ray.remote - def g(): - time.sleep(.1) - return 0 - - @ray.remote(max_calls=1, max_retries=0) - def f(): - return [g.remote()] - - nested = ray.get([f.remote() for _ in range(10)]) - - # Should still be able to retrieve these objects, since f's workers will - # wait for g to finish before exiting. 
- ray.get([x[0] for x in nested]) - - @ray.remote(max_calls=1, max_retries=0) - def f(): - return os.getpid(), g.remote() - - nested = ray.get([f.remote() for _ in range(10)]) - while nested: - pid, g_id = nested.pop(0) - ray.get(g_id) - del g_id - wait_for_pid_to_exit(pid) - - -@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") -def test_fair_queueing(shutdown_only): - ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1}) - - @ray.remote - def h(): - return 0 - - @ray.remote - def g(): - return ray.get(h.remote()) - - @ray.remote - def f(): - return ray.get(g.remote()) - - # This will never finish without fair queueing of {f, g, h}: - # https://github.com/ray-project/ray/issues/3644 - ready, _ = ray.wait( - [f.remote() for _ in range(1000)], timeout=60.0, num_returns=1000) - assert len(ready) == 1000, len(ready) - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cancel.py b/python/ray/tests/test_cancel.py index aefff09fae62..11b4dfbd4e64 100644 --- a/python/ray/tests/test_cancel.py +++ b/python/ray/tests/test_cancel.py @@ -175,8 +175,6 @@ def infinite_sleep(y): sleep_or_no = [random.randint(0, 1) for _ in range(100)] tasks = [infinite_sleep.remote(i) for i in sleep_or_no] cancelled = set() - - # Randomly kill queued tasks (infinitely sleeping or not). for t in tasks: if random.random() > 0.5: ray.cancel(t, force=use_force) @@ -188,13 +186,10 @@ def infinite_sleep(y): for done in cancelled: with pytest.raises(valid_exceptions(use_force)): ray.get(done, timeout=120) - - # Kill all infinitely sleeping tasks (queued or not). 
for indx, t in enumerate(tasks): if sleep_or_no[indx]: ray.cancel(t, force=use_force) cancelled.add(t) - for indx, t in enumerate(tasks): if t in cancelled: with pytest.raises(valid_exceptions(use_force)): ray.get(t, timeout=120) @@ -218,8 +213,8 @@ def fast(y): # between a worker receiving a task and the worker executing # that task (specifically the python execution), Cancellation # can fail. - - time.sleep(0.1) + if not use_force: + time.sleep(0.1) ray.cancel(x, force=use_force) ids.append(x) diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index f5628701f91b..57bf61419690 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -37,7 +37,6 @@ import ray.autoscaler._private.aws.config as aws_config import ray.scripts.scripts as scripts -from ray.test_utils import wait_for_condition boto3_list = [{ "InstanceType": "t1.micro", @@ -416,32 +415,5 @@ def commands_mock(command, stdin): _check_output_via_pattern("test_ray_submit.txt", result) -def test_ray_status(): - import ray - address = ray.init().get("redis_address") - runner = CliRunner() - - def output_ready(): - result = runner.invoke(scripts.status) - result.stdout - return not result.exception and "memory" in result.output - - wait_for_condition(output_ready) - - result = runner.invoke(scripts.status, []) - _check_output_via_pattern("test_ray_status.txt", result) - - result_arg = runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_arg) - - # Try to check status with RAY_ADDRESS set - os.environ["RAY_ADDRESS"] = address - result_env = runner.invoke(scripts.status) - _check_output_via_pattern("test_ray_status.txt", result_env) - - result_env_arg = runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_env_arg) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt 
b/python/ray/tests/test_cli_patterns/test_ray_status.txt deleted file mode 100644 index f903c6d62503..000000000000 --- a/python/ray/tests/test_cli_patterns/test_ray_status.txt +++ /dev/null @@ -1,14 +0,0 @@ -======== Cluster status: .+ -Node status ------------------------------------------------------------- - 1 node\(s\) with resources: .+ - -Resources ------------------------------------------------------------- -Usage: - 0.+ - 0.+ - 0.+ - -Demands: - \(no resource demands\) diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml index f3d6a03ce1b1..4d63420092e5 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml @@ -12,6 +12,7 @@ head_start_ray_commands: - ray stop - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 +initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -26,6 +27,7 @@ setup_commands: - echo a - echo b - echo ${echo hi} +target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t1.micro diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml index bffd0f53f2ae..8d898f749646 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml @@ -17,6 +17,7 @@ head_start_ray_commands: - ray stop - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 +initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -31,6 +32,7 @@ setup_commands: - echo a - echo b - echo ${echo hi} +target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t3a.small diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index 73b19a2f2ab9..21bb807fda55 100644 
--- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -2,13 +2,42 @@ import time import sys import logging -import threading import ray.util.client.server.server as ray_client_server +from ray.util.client import RayAPIStub from ray.util.client.common import ClientObjectRef from ray.util.client.ray_client_helpers import ray_start_client_server +def test_num_clients(shutdown_only): + # Tests num clients reporting; useful if you want to build an app that + # load balances clients between Ray client servers. + server = ray_client_server.serve("localhost:50051") + try: + api1 = RayAPIStub() + info1 = api1.connect("localhost:50051") + assert info1["num_clients"] == 1, info1 + api2 = RayAPIStub() + info2 = api2.connect("localhost:50051") + assert info2["num_clients"] == 2, info2 + + # Disconnect the first two clients. + api1.disconnect() + api2.disconnect() + time.sleep(1) + + api3 = RayAPIStub() + info3 = api3.connect("localhost:50051") + assert info3["num_clients"] == 1, info3 + + # Check info contains ray and python version. 
+ assert isinstance(info3["ray_version"], str), info3 + assert isinstance(info3["ray_commit"], str), info3 + assert isinstance(info3["python_version"], str), info3 + finally: + server.stop(0) + + @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_real_ray_fallback(ray_start_regular_shared): with ray_start_client_server() as ray: @@ -322,25 +351,12 @@ def get(self): actor.inc.remote() actor.inc.remote() + del actor - # Make sure the get_actor call works new_actor = ray.get_actor("test_acc") new_actor.inc.remote() assert ray.get(new_actor.get.remote()) == 3 - del actor - - actor = Accumulator.options( - name="test_acc2", lifetime="detached").remote() - actor.inc.remote() - del actor - - detatched_actor = ray.get_actor("test_acc2") - for i in range(5): - detatched_actor.inc.remote() - - assert ray.get(detatched_actor.get.remote()) == 6 - @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_internal_kv(ray_start_regular_shared): @@ -357,52 +373,5 @@ def test_internal_kv(ray_start_regular_shared): assert ray._internal_kv_get("apple") == b"" -def test_startup_retry(ray_start_regular_shared): - from ray.util.client import ray as ray_client - ray_client._inside_client_test = True - - with pytest.raises(ConnectionError): - ray_client.connect("localhost:50051", connection_retries=1) - - def run_client(): - ray_client.connect("localhost:50051") - ray_client.disconnect() - - thread = threading.Thread(target=run_client, daemon=True) - thread.start() - time.sleep(3) - server = ray_client_server.serve("localhost:50051") - thread.join() - server.stop(0) - ray_client._inside_client_test = False - - -def test_dataclient_server_drop(ray_start_regular_shared): - from ray.util.client import ray as ray_client - ray_client._inside_client_test = True - - @ray_client.remote - def f(x): - time.sleep(4) - return x - - def stop_server(server): - time.sleep(2) - server.stop(0) - - server = 
ray_client_server.serve("localhost:50051") - ray_client.connect("localhost:50051") - thread = threading.Thread(target=stop_server, args=(server, )) - thread.start() - x = f.remote(2) - with pytest.raises(ConnectionError): - _ = ray_client.get(x) - thread.join() - ray_client.disconnect() - ray_client._inside_client_test = False - # Wait for f(x) to finish before ray.shutdown() in the fixture - time.sleep(3) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py deleted file mode 100644 index 8053ab5774e6..000000000000 --- a/python/ray/tests/test_client_init.py +++ /dev/null @@ -1,183 +0,0 @@ -"""Client tests that run their own init (as with init_and_serve) live here""" -import pytest - -import time -import random -import sys - -import ray.util.client.server.server as ray_client_server -import ray.core.generated.ray_client_pb2 as ray_client_pb2 - -from ray.util.client import RayAPIStub, CURRENT_PROTOCOL_VERSION - -import ray - - -@ray.remote -def hello_world(): - c1 = complex_task.remote(random.randint(1, 10)) - c2 = complex_task.remote(random.randint(1, 10)) - return sum(ray.get([c1, c2])) - - -@ray.remote -def complex_task(value): - time.sleep(1) - return value * 10 - - -@ray.remote -class C: - def __init__(self, x): - self.val = x - - def double(self): - self.val += self.val - - def get(self): - return self.val - - -@pytest.fixture -def init_and_serve(): - server_handle, _ = ray_client_server.init_and_serve("localhost:50051") - yield server_handle - ray_client_server.shutdown_with_server(server_handle.grpc_server) - time.sleep(2) - - -@pytest.fixture -def init_and_serve_lazy(): - cluster = ray.cluster_utils.Cluster() - cluster.add_node(num_cpus=1, num_gpus=0) - address = cluster.address - - def connect(): - ray.init(address=address) - - server_handle = ray_client_server.serve("localhost:50051", connect) - yield server_handle - 
ray_client_server.shutdown_with_server(server_handle.grpc_server) - time.sleep(2) - - -def test_basic_preregister(init_and_serve): - from ray.util.client import ray - ray.connect("localhost:50051") - val = ray.get(hello_world.remote()) - print(val) - assert val >= 20 - assert val <= 200 - c = C.remote(3) - x = c.double.remote() - y = c.double.remote() - ray.wait([x, y]) - val = ray.get(c.get.remote()) - assert val == 12 - ray.disconnect() - - -def test_num_clients(init_and_serve_lazy): - # Tests num clients reporting; useful if you want to build an app that - # load balances clients between Ray client servers. - - def get_job_id(api): - return api.get_runtime_context().worker.current_job_id - - api1 = RayAPIStub() - info1 = api1.connect("localhost:50051") - job_id_1 = get_job_id(api1) - assert info1["num_clients"] == 1, info1 - api2 = RayAPIStub() - info2 = api2.connect("localhost:50051") - job_id_2 = get_job_id(api2) - assert info2["num_clients"] == 2, info2 - - assert job_id_1 == job_id_2 - - # Disconnect the first two clients. - api1.disconnect() - api2.disconnect() - time.sleep(1) - - api3 = RayAPIStub() - info3 = api3.connect("localhost:50051") - job_id_3 = get_job_id(api3) - assert info3["num_clients"] == 1, info3 - assert job_id_1 != job_id_3 - - # Check info contains ray and python version. 
- assert isinstance(info3["ray_version"], str), info3 - assert isinstance(info3["ray_commit"], str), info3 - assert isinstance(info3["python_version"], str), info3 - assert isinstance(info3["protocol_version"], str), info3 - api3.disconnect() - - -def test_python_version(init_and_serve): - server_handle = init_and_serve - ray = RayAPIStub() - info1 = ray.connect("localhost:50051") - assert info1["python_version"] == ".".join( - [str(x) for x in list(sys.version_info)[:3]]) - ray.disconnect() - time.sleep(1) - - def mock_connection_response(): - return ray_client_pb2.ConnectionInfoResponse( - num_clients=1, - python_version="2.7.12", - ray_version="", - ray_commit="", - protocol_version=CURRENT_PROTOCOL_VERSION, - ) - - # inject mock connection function - server_handle.data_servicer._build_connection_response = \ - mock_connection_response - - ray = RayAPIStub() - with pytest.raises(RuntimeError): - _ = ray.connect("localhost:50051") - - ray = RayAPIStub() - info3 = ray.connect("localhost:50051", ignore_version=True) - assert info3["num_clients"] == 1, info3 - ray.disconnect() - - -def test_protocol_version(init_and_serve): - server_handle = init_and_serve - ray = RayAPIStub() - info1 = ray.connect("localhost:50051") - local_py_version = ".".join([str(x) for x in list(sys.version_info)[:3]]) - assert info1["protocol_version"] == CURRENT_PROTOCOL_VERSION, info1 - ray.disconnect() - time.sleep(1) - - def mock_connection_response(): - return ray_client_pb2.ConnectionInfoResponse( - num_clients=1, - python_version=local_py_version, - ray_version="", - ray_commit="", - protocol_version="2050-01-01", # from the future - ) - - # inject mock connection function - server_handle.data_servicer._build_connection_response = \ - mock_connection_response - - ray = RayAPIStub() - with pytest.raises(RuntimeError): - _ = ray.connect("localhost:50051") - - ray = RayAPIStub() - info3 = ray.connect("localhost:50051", ignore_version=True) - assert info3["num_clients"] == 1, info3 - 
ray.disconnect() - - -if __name__ == "__main__": - import pytest - sys.exit(pytest.main(["-v", __file__] + sys.argv[1:])) diff --git a/python/ray/tests/test_client_metadata.py b/python/ray/tests/test_client_metadata.py index 1a6c4e2a5633..ffec75a77c17 100644 --- a/python/ray/tests/test_client_metadata.py +++ b/python/ray/tests/test_client_metadata.py @@ -38,8 +38,3 @@ def test_get_runtime_context(ray_start_regular_shared): with pytest.raises(Exception): _ = rtc.task_id - - -if __name__ == "__main__": - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_client_references.py b/python/ray/tests/test_client_references.py index b0dd01b0498a..834fadfcf874 100644 --- a/python/ray/tests/test_client_references.py +++ b/python/ray/tests/test_client_references.py @@ -1,53 +1,39 @@ -import pytest from ray.util.client.ray_client_helpers import ray_start_client_server -from ray.util.client.ray_client_helpers import ( - ray_start_client_server_pair, ray_start_cluster_client_server_pair) from ray.test_utils import wait_for_condition import ray as real_ray from ray.core.generated.gcs_pb2 import ActorTableData +from ray.util.client.server.server import _get_current_servicer -def server_object_ref_count(server, n): +def server_object_ref_count(n): + server = _get_current_servicer() assert server is not None def test_cond(): - if len(server.task_servicer.object_refs) == 0: + if len(server.object_refs) == 0: # No open clients return n == 0 - client_id = list(server.task_servicer.object_refs.keys())[0] - return len(server.task_servicer.object_refs[client_id]) == n + client_id = list(server.object_refs.keys())[0] + return len(server.object_refs[client_id]) == n return test_cond -def server_actor_ref_count(server, n): +def server_actor_ref_count(n): + server = _get_current_servicer() assert server is not None def test_cond(): - if len(server.task_servicer.actor_refs) == 0: + if len(server.actor_refs) == 0: # No running actors return n == 0 - return 
len(server.task_servicer.actor_refs) == n + return len(server.actor_refs) == n return test_cond -@pytest.mark.parametrize( - "ray_start_cluster", - [{ - "num_nodes": 1, - "do_init": False, - # This test uses ray.objects(), which only works with the GCS-based - # object directory - "_system_config": { - "ownership_based_object_directory_enabled": False - }, - }], - indirect=True) -def test_delete_refs_on_disconnect(ray_start_cluster): - cluster = ray_start_cluster - with ray_start_cluster_client_server_pair(cluster.address) as pair: - ray, server = pair +def test_delete_refs_on_disconnect(ray_start_regular): + with ray_start_client_server() as ray: @ray.remote def f(x): @@ -60,18 +46,14 @@ def f(x): # in a different category, according to the raylet. assert len(real_ray.objects()) == 2 # But we're maintaining the reference - assert server_object_ref_count(server, 3)() + assert server_object_ref_count(3)() # And can get the data assert ray.get(thing1) == 8 - # Close the client. + # Close the client ray.close() - wait_for_condition(server_object_ref_count(server, 0), timeout=5) - - # Connect to the real ray again, since we disconnected - # upon num_clients = 0. 
- real_ray.init(address=cluster.address) + wait_for_condition(server_object_ref_count(0), timeout=5) def test_cond(): return len(real_ray.objects()) == 0 @@ -80,8 +62,7 @@ def test_cond(): def test_delete_ref_on_object_deletion(ray_start_regular): - with ray_start_client_server_pair() as pair: - ray, server = pair + with ray_start_client_server() as ray: vals = { "ref": ray.put("Hello World"), "ref2": ray.put("This value stays"), @@ -89,18 +70,11 @@ def test_delete_ref_on_object_deletion(ray_start_regular): del vals["ref"] - wait_for_condition(server_object_ref_count(server, 1), timeout=5) + wait_for_condition(server_object_ref_count(1), timeout=5) -@pytest.mark.parametrize( - "ray_start_cluster", [{ - "num_nodes": 1, - "do_init": False - }], indirect=True) -def test_delete_actor_on_disconnect(ray_start_cluster): - cluster = ray_start_cluster - with ray_start_cluster_client_server_pair(cluster.address) as pair: - ray, server = pair +def test_delete_actor_on_disconnect(ray_start_regular): + with ray_start_client_server() as ray: @ray.remote class Accumulator: @@ -116,13 +90,13 @@ def get(self): actor = Accumulator.remote() actor.inc.remote() - assert server_actor_ref_count(server, 1)() + assert server_actor_ref_count(1)() assert ray.get(actor.get.remote()) == 1 ray.close() - wait_for_condition(server_actor_ref_count(server, 0), timeout=5) + wait_for_condition(server_actor_ref_count(0), timeout=5) def test_cond(): alive_actors = [ @@ -131,16 +105,11 @@ def test_cond(): ] return len(alive_actors) == 0 - # Connect to the real ray again, since we disconnected - # upon num_clients = 0. 
- real_ray.init(address=cluster.address) - wait_for_condition(test_cond, timeout=10) def test_delete_actor(ray_start_regular): - with ray_start_client_server_pair() as pair: - ray, server = pair + with ray_start_client_server() as ray: @ray.remote class Accumulator: @@ -155,11 +124,11 @@ def inc(self): actor2 = Accumulator.remote() actor2.inc.remote() - assert server_actor_ref_count(server, 2)() + assert server_actor_ref_count(2)() del actor - wait_for_condition(server_actor_ref_count(server, 1), timeout=5) + wait_for_condition(server_actor_ref_count(1), timeout=5) def test_simple_multiple_references(ray_start_regular): @@ -181,9 +150,3 @@ def get(self): del ref1 assert ray.get(ref2) == "hi" del ref2 - - -if __name__ == "__main__": - import sys - import pytest - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_client_terminate.py b/python/ray/tests/test_client_terminate.py index 6f7af830f349..9016c627a6a5 100644 --- a/python/ray/tests/test_client_terminate.py +++ b/python/ray/tests/test_client_terminate.py @@ -83,9 +83,3 @@ def wait_for(t): signaler2.send.remote() ray.get(obj1) - - -if __name__ == "__main__": - import sys - import pytest - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_coordinator_server.py b/python/ray/tests/test_coordinator_server.py index 0c59b909e94c..6fb654e3e550 100644 --- a/python/ray/tests/test_coordinator_server.py +++ b/python/ray/tests/test_coordinator_server.py @@ -52,6 +52,7 @@ def testClusterStateInit(self): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, + "initial_workers": 0, "provider": { "type": "local", "head_ip": "0.0.0.0:2", @@ -153,6 +154,7 @@ def testCoordinatorSenderNodeProvider(self): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, + "initial_workers": 0, "provider": { "type": "local", "coordinator_address": self.coordinator_address, diff --git a/python/ray/tests/test_cross_language.py b/python/ray/tests/test_cross_language.py index 
4ffd6db3e4f1..10766b18bd44 100644 --- a/python/ray/tests/test_cross_language.py +++ b/python/ray/tests/test_cross_language.py @@ -24,7 +24,3 @@ class PythonObject(object): with pytest.raises(Exception, match="transfer"): ray.java_function("a", "b").remote(PythonObject()) - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_dask_optimization.py b/python/ray/tests/test_dask_optimization.py deleted file mode 100644 index e8a045aeee24..000000000000 --- a/python/ray/tests/test_dask_optimization.py +++ /dev/null @@ -1,63 +0,0 @@ -import dask -import dask.dataframe as dd -from dask.dataframe.shuffle import SimpleShuffleLayer -import mock -import numpy as np -import pandas as pd -import pytest - -from ray.util.dask import dataframe_optimize -from ray.util.dask.optimizations import (rewrite_simple_shuffle_layer, - MultipleReturnSimpleShuffleLayer) - - -def test_rewrite_simple_shuffle_layer(): - npartitions = 10 - df = dd.from_pandas( - pd.DataFrame( - np.random.randint(0, 100, size=(100, 2)), columns=["age", - "grade"]), - npartitions=npartitions) - # We set max_branch=npartitions in order to ensure that the task-based - # shuffle happens in a single stage, which is required in order for our - # optimization to work. 
- a = df.set_index(["age"], shuffle="tasks", max_branch=npartitions) - - dsk = a.__dask_graph__() - keys = a.__dask_keys__() - assert any(type(v) is SimpleShuffleLayer for k, v in dsk.layers.items()) - dsk = rewrite_simple_shuffle_layer(dsk, keys) - assert all( - type(v) is not SimpleShuffleLayer for k, v in dsk.layers.items()) - assert any( - type(v) is MultipleReturnSimpleShuffleLayer - for k, v in dsk.layers.items()) - - -@mock.patch("ray.util.dask.optimizations.rewrite_simple_shuffle_layer") -def test_dataframe_optimize(mock_rewrite): - def side_effect(dsk, keys): - return rewrite_simple_shuffle_layer(dsk, keys) - - mock_rewrite.side_effect = side_effect - with dask.config.set(dataframe_optimize=dataframe_optimize): - npartitions = 10 - df = dd.from_pandas( - pd.DataFrame( - np.random.randint(0, 100, size=(100, 2)), - columns=["age", "grade"]), - npartitions=npartitions) - # We set max_branch=npartitions in order to ensure that the task-based - # shuffle happens in a single stage, which is required in order for our - # optimization to work. 
- a = df.set_index( - ["age"], shuffle="tasks", max_branch=npartitions).compute() - - assert mock_rewrite.call_count == 2 - assert a.index.is_monotonic_increasing - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_dask_scheduler.py b/python/ray/tests/test_dask_scheduler.py index 54ba40521a81..28a98a76eda8 100644 --- a/python/ray/tests/test_dask_scheduler.py +++ b/python/ray/tests/test_dask_scheduler.py @@ -35,9 +35,7 @@ def call_add(): def test_ray_dask_persist(ray_start_regular_shared): arr = da.ones(5) + 2 result = arr.persist(scheduler=ray_dask_get) - np.testing.assert_array_equal( - next(iter(result.dask.values())), - np.ones(5) + 2) + np.testing.assert_array_equal(result.dask.values()[0], np.ones(5) + 2) if __name__ == "__main__": diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index 724033c1965c..f45aea9b4292 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -20,52 +20,6 @@ get_error_message, Semaphore) -def test_unhandled_errors(ray_start_regular): - @ray.remote - def f(): - raise ValueError() - - @ray.remote - class Actor: - def f(self): - raise ValueError() - - a = Actor.remote() - num_exceptions = 0 - - def interceptor(e): - nonlocal num_exceptions - num_exceptions += 1 - - # Test we report unhandled exceptions. - ray.worker._unhandled_error_handler = interceptor - x1 = f.remote() - x2 = a.f.remote() - del x1 - del x2 - wait_for_condition(lambda: num_exceptions == 2) - - # Test we don't report handled exceptions. - x1 = f.remote() - x2 = a.f.remote() - with pytest.raises(ray.exceptions.RayError) as err: # noqa - ray.get([x1, x2]) - del x1 - del x2 - time.sleep(1) - assert num_exceptions == 2, num_exceptions - - # Test suppression with env var works. 
- try: - os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1" - x1 = f.remote() - del x1 - time.sleep(1) - assert num_exceptions == 2, num_exceptions - finally: - del os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] - - def test_failed_task(ray_start_regular, error_pubsub): @ray.remote def throw_exception_fct1(): @@ -800,15 +754,12 @@ class Foo: def __init__(self): time.sleep(1000) - # NOTE: We should save actor, otherwise it will be out of scope. - actors = [Foo.remote() for _ in range(num_cpus * 3)] - assert len(actors) == num_cpus * 3 + [Foo.remote() for _ in range(num_cpus * 3)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR - actors = [Foo.remote() for _ in range(num_cpus)] - assert len(actors) == num_cpus + [Foo.remote() for _ in range(num_cpus)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR @@ -1039,7 +990,7 @@ def sleep_to_kill_raylet(): def test_connect_with_disconnected_node(shutdown_only): config = { "num_heartbeats_timeout": 50, - "raylet_heartbeat_period_milliseconds": 10, + "raylet_heartbeat_timeout_milliseconds": 10, } cluster = Cluster() cluster.add_node(num_cpus=0, _system_config=config) @@ -1088,10 +1039,7 @@ def some_expensive_task(self): def test_fill_object_store_exception(shutdown_only): - ray.init( - num_cpus=2, - object_store_memory=10**8, - _system_config={"automatic_object_spilling_enabled": False}) + ray.init(num_cpus=2, object_store_memory=10**8) @ray.remote def expensive_task(): @@ -1120,6 +1068,56 @@ def test(self): ray.put(np.zeros(10**8 + 2, dtype=np.uint8)) +def test_fill_object_store_lru_fallback(shutdown_only): + config = { + "free_objects_batch_size": 1, + } + ray.init( + num_cpus=2, + object_store_memory=10**8, + _lru_evict=True, + _system_config=config) + + @ray.remote + def expensive_task(): + return np.zeros((10**8) 
// 2, dtype=np.uint8) + + # Check that objects out of scope are cleaned up quickly. + ray.get(expensive_task.remote()) + start = time.time() + for _ in range(3): + ray.get(expensive_task.remote()) + end = time.time() + assert end - start < 3 + + obj_refs = [] + for _ in range(3): + obj_ref = expensive_task.remote() + ray.get(obj_ref) + obj_refs.append(obj_ref) + + @ray.remote + class LargeMemoryActor: + def some_expensive_task(self): + return np.zeros(10**8 // 2, dtype=np.uint8) + + def test(self): + return 1 + + actor = LargeMemoryActor.remote() + for _ in range(3): + obj_ref = actor.some_expensive_task.remote() + ray.get(obj_ref) + obj_refs.append(obj_ref) + # Make sure actor does not die + ray.get(actor.test.remote()) + + for _ in range(3): + obj_ref = ray.put(np.zeros(10**8 // 2, dtype=np.uint8)) + ray.get(obj_ref) + obj_refs.append(obj_ref) + + @pytest.mark.parametrize( "ray_start_cluster", [{ "num_nodes": 1, @@ -1201,7 +1199,7 @@ def get(obj_refs, test_dependent_task): def test_fate_sharing(ray_start_cluster, use_actors, node_failure): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, } cluster = Cluster() # Head node with no resources. 
diff --git a/python/ray/tests/test_global_state.py b/python/ray/tests/test_global_state.py index 7522039eceed..3dcd64c1ebd2 100644 --- a/python/ray/tests/test_global_state.py +++ b/python/ray/tests/test_global_state.py @@ -7,7 +7,6 @@ import ray import ray.ray_constants -import ray.services import ray.test_utils from ray._raylet import GlobalStateAccessor @@ -333,31 +332,6 @@ def backlog_size_set(): global_state_accessor.disconnect() -def test_heartbeat_ip(shutdown_only): - cluster = ray.init( - num_cpus=1, _system_config={ - "report_worker_backlog": True, - }) - global_state_accessor = GlobalStateAccessor( - cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD) - global_state_accessor.connect() - - self_ip = ray.services.get_node_ip_address() - - def self_ip_is_set(): - message = global_state_accessor.get_all_resource_usage() - if message is None: - return False - - resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString( - message) - resources_data = resource_usage.batch[0] - return resources_data.node_manager_address == self_ip - - ray.test_utils.wait_for_condition(self_ip_is_set, timeout=2) - global_state_accessor.disconnect() - - if __name__ == "__main__": import pytest import sys diff --git a/python/ray/tests/test_job.py b/python/ray/tests/test_job.py index 15313d7bafbd..15b082b460e0 100644 --- a/python/ray/tests/test_job.py +++ b/python/ray/tests/test_job.py @@ -33,7 +33,7 @@ def __init__(self): assert len(actor_table) == 1 job_table = ray.jobs() - assert len(job_table) == 2 # dash + assert len(job_table) == 2 # Kill the driver process. p.kill() @@ -79,7 +79,7 @@ def value(self): assert len(actor_table) == 1 job_table = ray.jobs() - assert len(job_table) == 2 # dash + assert len(job_table) == 2 # Kill the driver process. 
p.kill() diff --git a/python/ray/tests/test_k8s_cluster_launcher.py b/python/ray/tests/test_k8s_cluster_launcher.py index 49ecadd688bb..eb6d596b93e5 100644 --- a/python/ray/tests/test_k8s_cluster_launcher.py +++ b/python/ray/tests/test_k8s_cluster_launcher.py @@ -69,8 +69,8 @@ def test_up_and_down(self): while True: monitor_output = sdk.run_on_cluster( config, cmd=log_cmd, with_output=True).decode() - if ("head-node" in monitor_output - and "worker-node" in monitor_output): + if ("ray-legacy-head-node-type" in monitor_output + and "ray-legacy-worker-node-type" in monitor_output): break else: time.sleep(1) diff --git a/python/ray/tests/test_k8s_operator_examples.py b/python/ray/tests/test_k8s_operator_examples.py deleted file mode 100644 index 025ad1709172..000000000000 --- a/python/ray/tests/test_k8s_operator_examples.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Tests launch, teardown, and update of multiple Ray clusters using Kubernetes -operator.""" -import copy -import sys -import os -import subprocess -import tempfile -import time -import unittest - -import kubernetes -import pytest -import yaml - -IMAGE_ENV = "KUBERNETES_OPERATOR_TEST_IMAGE" -IMAGE = os.getenv(IMAGE_ENV, "rayproject/ray:nightly") -NAMESPACE = "test-k8s-operator-examples" - - -def retry_until_true(f): - # Retry 60 times with 1 second delay between attempts. - def f_with_retries(*args, **kwargs): - for _ in range(120): - if f(*args, **kwargs): - return - else: - time.sleep(1) - pytest.fail("The condition wasn't met before the timeout expired.") - - return f_with_retries - - -@retry_until_true -def wait_for_pods(n): - client = kubernetes.client.CoreV1Api() - pods = client.list_namespaced_pod(namespace=NAMESPACE).items - # Double-check that the correct image is use. 
- for pod in pods: - assert pod.spec.containers[0].image == IMAGE - return len(pods) == n - - -@retry_until_true -def wait_for_logs(): - """Check if logs indicate presence of nodes of types "head-node" and - "worker-nodes" in the "example-cluster" cluster.""" - cmd = f"kubectl -n {NAMESPACE} logs ray-operator-pod"\ - "| grep ^example-cluster: | tail -n 100" - log_tail = subprocess.check_output(cmd, shell=True).decode() - return ("head-node" in log_tail) and ("worker-node" in log_tail) - - -@retry_until_true -def wait_for_job(job_pod): - cmd = f"kubectl -n {NAMESPACE} logs {job_pod}" - out = subprocess.check_output(cmd, shell=True).decode() - return ("success" in out.lower()) - - -def kubernetes_configs_directory(): - here = os.path.realpath(__file__) - ray_python_root = os.path.dirname(os.path.dirname(here)) - relative_path = "autoscaler/kubernetes" - return os.path.join(ray_python_root, relative_path) - - -def get_kubernetes_config_path(name): - return os.path.join(kubernetes_configs_directory(), name) - - -def get_operator_config_path(file_name): - operator_configs = get_kubernetes_config_path("operator_configs") - return os.path.join(operator_configs, file_name) - - -class KubernetesOperatorTest(unittest.TestCase): - def test_examples(self): - with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \ - tempfile.NamedTemporaryFile("w+") as example_cluster2_file,\ - tempfile.NamedTemporaryFile("w+") as operator_file,\ - tempfile.NamedTemporaryFile("w+") as job_file: - - # Get paths to operator configs - example_cluster_config_path = get_operator_config_path( - "example_cluster.yaml") - example_cluster2_config_path = get_operator_config_path( - "example_cluster2.yaml") - operator_config_path = get_operator_config_path("operator.yaml") - job_path = get_kubernetes_config_path("job-example.yaml") - self.crd_path = get_operator_config_path("cluster_crd.yaml") - - # Load operator configs - example_cluster_config = yaml.safe_load( - 
open(example_cluster_config_path).read()) - example_cluster2_config = yaml.safe_load( - open(example_cluster2_config_path).read()) - operator_config = list( - yaml.safe_load_all(open(operator_config_path).read())) - job_config = yaml.safe_load(open(job_path).read()) - - # Fill image fields - podTypes = example_cluster_config["spec"]["podTypes"] - podTypes2 = example_cluster2_config["spec"]["podTypes"] - pod_specs = ([operator_config[-1]["spec"]] + [ - job_config["spec"]["template"]["spec"] - ] + [podType["podConfig"]["spec"] for podType in podTypes - ] + [podType["podConfig"]["spec"] for podType in podTypes2]) - for pod_spec in pod_specs: - pod_spec["containers"][0]["image"] = IMAGE - pod_spec["containers"][0]["imagePullPolicy"] = "IfNotPresent" - - # Dump to temporary files - yaml.dump(example_cluster_config, example_cluster_file) - yaml.dump(example_cluster2_config, example_cluster2_file) - yaml.dump(job_config, job_file) - yaml.dump_all(operator_config, operator_file) - files = [ - example_cluster_file, example_cluster2_file, operator_file - ] - for file in files: - file.flush() - - # Apply CR - cmd = f"kubectl apply -f {self.crd_path}" - subprocess.check_call(cmd, shell=True) - - # Create namespace - cmd = f"kubectl create namespace {NAMESPACE}" - subprocess.check_call(cmd, shell=True) - - # Start operator and two clusters - for file in files: - cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}" - subprocess.check_call(cmd, shell=True) - - # Check that autoscaling respects minWorkers by waiting for - # six pods in the namespace. - wait_for_pods(6) - - # Check that logging output looks normal (two workers connected to - # ray cluster example-cluster.) 
- wait_for_logs() - - # Delete the second cluster - cmd = f"kubectl -n {NAMESPACE} delete -f"\ - f"{example_cluster2_file.name}" - subprocess.check_call(cmd, shell=True) - - # Four pods remain - wait_for_pods(4) - - # Check job submission - cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}" - subprocess.check_call(cmd, shell=True) - - cmd = f"kubectl -n {NAMESPACE} get pods --no-headers -o"\ - " custom-columns=\":metadata.name\"" - pods = subprocess.check_output(cmd, shell=True).decode().split() - job_pod = [pod for pod in pods if "job" in pod].pop() - time.sleep(10) - wait_for_job(job_pod) - cmd = f"kubectl -n {NAMESPACE} delete jobs --all" - subprocess.check_call(cmd, shell=True) - - # Check that cluster updates work: increase minWorkers to 3 - # and check that one worker is created. - example_cluster_edit = copy.deepcopy(example_cluster_config) - example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 3 - yaml.dump(example_cluster_edit, example_cluster_file) - example_cluster_file.flush() - cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}" - subprocess.check_call(cm, shell=True) - - wait_for_pods(5) - - # Delete the first cluster - cmd = f"kubectl -n {NAMESPACE} delete -f"\ - f"{example_cluster_file.name}" - subprocess.check_call(cmd, shell=True) - - # Only operator pod remains. 
- wait_for_pods(1) - - def __del__(self): - cmd = f"kubectl delete -f {self.crd_path}" - subprocess.check_call(cmd, shell=True) - cmd = f"kubectl delete namespace {NAMESPACE}" - subprocess.check_call(cmd, shell=True) - - -if __name__ == "__main__": - kubernetes.config.load_kube_config() - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_k8s_operator_mock.py b/python/ray/tests/test_k8s_operator_mock.py deleted file mode 100644 index a3bbf5766922..000000000000 --- a/python/ray/tests/test_k8s_operator_mock.py +++ /dev/null @@ -1,162 +0,0 @@ -import os -import unittest -from unittest.mock import patch - -import pytest -import tempfile -import yaml - -from ray.autoscaler.tags import TAG_RAY_NODE_KIND, NODE_KIND_HEAD -from ray.autoscaler.node_provider import NodeProvider -from ray.ray_operator.operator import RayCluster -from ray.ray_operator.operator_utils import cr_to_config -from ray.autoscaler._private.kubernetes.node_provider import\ - KubernetesNodeProvider -from ray.autoscaler._private.updater import NodeUpdaterThread -""" -Tests that, when the K8s operator launches a cluster, no files are mounted onto -the head node. -The main idea is to mock the NodeUpdaterThread to report if it received any -file mounts. -""" - -# NodeUpdaterThread mock methods -START = "start" -JOIN = "join" - - -def mock_start(self): - # Detects any file mounts passed in NodeUpdaterThread.__init__() - if self.file_mounts: - raise ValueError("File mounts in operator's code path.") - - -def mock_join(self): - # Fake success - self.exitcode = 0 - return - - -# RayCluster mock methods -SETUP_LOGGING = "setup_logging" -WRITE_CONFIG = "write_config" - - -def mock_setup_logging(self): - return - - -def mock_write_config(self): - # Use a named temporary file instead of a real one. 
- self.config_file = tempfile.NamedTemporaryFile("w") - self.config_path = self.config_file.name - yaml.dump(self.config, self.config_file) - self.config_file.flush() - - -# KubernetesNodeProvider mock methods -INIT = "__init__" -NON_TERMINATED_NODES = "non_terminated_nodes" -CREATE_NODE = "create_node" -BOOTSTRAP_CONFIG = "bootstrap_config" - -HEAD_NODE_TAGS = {TAG_RAY_NODE_KIND: NODE_KIND_HEAD} - - -def mock_init(self, provider_config, cluster_name): - # Adds an attribute to detect if the provider has created the head. - NodeProvider.__init__(self, provider_config, cluster_name) - self.cluster_name = cluster_name - self.namespace = provider_config["namespace"] - - self._head_created = False - - -def mock_non_terminated_nodes(self, node_tags): - # First time this is called, it returns an empty list. - # Second time, returns a mock head node id. - if HEAD_NODE_TAGS.items() <= node_tags.items() and self._head_created: - # Second call. - return ["HEAD"] - elif node_tags == HEAD_NODE_TAGS: - # First call. - return [] - else: - # Should not go here. - raise ValueError("Test passed invalid parameters.") - - -def mock_create_node(self, node_config, tags, count): - # Called during head node creation. Marks that a head node has been - # created. - if HEAD_NODE_TAGS.items() <= tags.items() and count == 1: - self._head_created = True - else: - raise ValueError(f"Test passed invalid parameter {tags} {count}.") - - -def mock_bootstrap_config(cluster_config): - # KubernetesNodeProvider.bootstrap_config has no side effects - # on cluster_config -- the method just creates K8s API objects. - # Thus it makes sense to dummy out the K8s API calls and return - # the config. - return cluster_config - - -def custom_resources(): - # K8s custom resources used in test. 
- here = os.path.realpath(__file__) - ray_python_root = os.path.dirname(os.path.dirname(here)) - relative_path = "autoscaler/kubernetes/operator_configs" - abs_path = os.path.join(ray_python_root, relative_path) - cluster1, cluster2 = "example_cluster.yaml", "example_cluster2.yaml" - path1, path2 = os.path.join(abs_path, cluster1), os.path.join( - abs_path, cluster2) - cr1, cr2 = (yaml.safe_load(open(path1).read()), - yaml.safe_load(open(path2).read())) - # Metadata and field is filled by K8s in real life. - cr1["metadata"]["uid"] = "abc" - cr2["metadata"]["uid"] = "xyz" - return cr1, cr2 - - -class OperatorTest(unittest.TestCase): - def test_no_file_mounts_k8s_operator_cluster_launch(self): - with patch.object(NodeUpdaterThread, START, mock_start),\ - patch.object(NodeUpdaterThread, JOIN, mock_join),\ - patch.object(RayCluster, SETUP_LOGGING, mock_setup_logging),\ - patch.object(RayCluster, WRITE_CONFIG, mock_write_config),\ - patch.object(KubernetesNodeProvider, INIT, mock_init),\ - patch.object(KubernetesNodeProvider, NON_TERMINATED_NODES, - mock_non_terminated_nodes),\ - patch.object(KubernetesNodeProvider, CREATE_NODE, - mock_create_node),\ - patch.object(KubernetesNodeProvider, BOOTSTRAP_CONFIG, - mock_bootstrap_config): - - cluster_cr1, cluster_cr2 = custom_resources() - - # Ensure that operator does not mount any files during cluster - # launch. - config1 = cr_to_config(cluster_cr1) - config1["provider"]["namespace"] = "test" - cluster1 = RayCluster(config1) - cluster1.start_head() - - # Check that this test is working correctly by inserting extraneous - # file mounts and confirming a ValueError from the mocked - # NodeUpdater. - config2 = cr_to_config(cluster_cr2) - config2["provider"]["namespace"] = "test" - # Note: There is no user interface for adding file mounts - # to the config of a Ray cluster run via the operator. - # This purely for purposes of testing this test. 
- config2["file_mounts"] = {"remote_foo": os.path.abspath(__file__)} - cluster2 = RayCluster(config2) - with pytest.raises(ValueError): - cluster2.start_head() - - -if __name__ == "__main__": - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_logging.py b/python/ray/tests/test_logging.py deleted file mode 100644 index 6796ac4f7187..000000000000 --- a/python/ray/tests/test_logging.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -from collections import defaultdict -from pathlib import Path - -import ray -from ray import ray_constants - - -def set_logging_config(max_bytes, backup_count): - os.environ["RAY_ROTATION_MAX_BYTES"] = str(max_bytes) - os.environ["RAY_ROTATION_BACKUP_COUNT"] = str(backup_count) - - -def test_log_rotation_config(ray_start_cluster): - cluster = ray_start_cluster - max_bytes = 100 - backup_count = 3 - - # Create a cluster. - set_logging_config(max_bytes, backup_count) - head_node = cluster.add_node(num_cpus=0) - # Set a different env var for a worker node. 
- set_logging_config(0, 0) - worker_node = cluster.add_node(num_cpus=0) - cluster.wait_for_nodes() - - config = head_node.logging_config - assert config["log_rotation_max_bytes"] == max_bytes - assert config["log_rotation_backup_count"] == backup_count - config = worker_node.logging_config - assert config["log_rotation_max_bytes"] == 0 - assert config["log_rotation_backup_count"] == 0 - - -def test_log_rotation(shutdown_only): - max_bytes = 1 - backup_count = 3 - set_logging_config(max_bytes, backup_count) - ray.init(num_cpus=1) - session_dir = ray.worker.global_worker.node.address_info["session_dir"] - session_path = Path(session_dir) - log_dir_path = session_path / "logs" - - log_rotating_component = [ - ray_constants.PROCESS_TYPE_DASHBOARD, - ray_constants.PROCESS_TYPE_DASHBOARD_AGENT, - ray_constants.PROCESS_TYPE_LOG_MONITOR, - ray_constants.PROCESS_TYPE_MONITOR, - ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER, - ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER, - # Below components are not log rotating now. - # ray_constants.PROCESS_TYPE_RAYLET, - # ray_constants.PROCESS_TYPE_GCS_SERVER, - # ray_constants.PROCESS_TYPE_WORKER, - ] - - # Run the basic workload. - @ray.remote - def f(): - for i in range(10): - print(f"test {i}") - - ray.get(f.remote()) - - paths = list(log_dir_path.iterdir()) - - def component_exist(component, paths): - for path in paths: - filename = path.stem - if component in filename: - return True - return False - - def component_file_size_small_enough(component): - """Although max_bytes is 1, the file can have size that is big. - For example, if the logger prints the traceback, it can be - much bigger. So, we shouldn't make the assertion too tight. - """ - small_enough_bytes = 512 # 512 bytes. 
- for path in paths: - if not component_exist(component, [path]): - continue - - if path.stat().st_size > small_enough_bytes: - return False - return True - - for component in log_rotating_component: - assert component_exist(component, paths) - assert component_file_size_small_enough(component) - - # Check if the backup count is respected. - file_cnts = defaultdict(int) - for path in paths: - filename = path.stem - filename_without_suffix = filename.split(".")[0] - file_cnts[filename_without_suffix] += 1 - for filename, file_cnt in file_cnts.items(): - # There could be backup_count + 1 files. - # EX) *.log, *.log.* (as many as backup count). - assert file_cnt <= backup_count + 1, ( - f"{filename} has files that are more than " - f"backup count {backup_count}, file count: {file_cnt}") - - -if __name__ == "__main__": - import pytest - import sys - # Make subprocess happy in bazel. - os.environ["LC_ALL"] = "en_US.UTF-8" - os.environ["LANG"] = "en_US.UTF-8" - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_memstat.py b/python/ray/tests/test_memstat.py index a0e8e3c90ed1..cb734b3b7582 100644 --- a/python/ray/tests/test_memstat.py +++ b/python/ray/tests/test_memstat.py @@ -27,8 +27,7 @@ def data_lines(memory_str): for line in memory_str.split("\n"): if (not line or "---" in line or "===" in line or "Object ID" in line - or "pid=" in line or "Plasma memory" in line - or "Objects consumed" in line): + or "pid=" in line or "Plasma memory" in line): continue yield line diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index 8e02c4ae360b..b52f472efc26 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -15,6 +15,54 @@ from ray.test_utils import wait_for_condition, SignalActor, fetch_prometheus +def test_prometheus_file_based_service_discovery(ray_start_cluster): + # Make sure Prometheus service discovery file is correctly written + # when number of nodes are 
dynamically changed. + NUM_NODES = 5 + cluster = ray_start_cluster + nodes = [cluster.add_node() for _ in range(NUM_NODES)] + cluster.wait_for_nodes() + addr = ray.init(address=cluster.address) + redis_address = addr["redis_address"] + writer = PrometheusServiceDiscoveryWriter( + redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray") + + def get_metrics_export_address_from_node(nodes): + return [ + "{}:{}".format(node.node_ip_address, node.metrics_export_port) + for node in nodes + ] + + loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + assert (set(get_metrics_export_address_from_node(nodes)) == set( + loaded_json_data["targets"])) + + # Let's update nodes. + for _ in range(3): + nodes.append(cluster.add_node()) + + # Make sure service discovery file content is correctly updated. + loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + assert (set(get_metrics_export_address_from_node(nodes)) == set( + loaded_json_data["targets"])) + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Failing on Windows.") +def test_prome_file_discovery_run_by_dashboard(shutdown_only): + ray.init(num_cpus=0) + global_node = ray.worker._global_node + temp_dir = global_node.get_temp_dir_path() + + def is_service_discovery_exist(): + for path in pathlib.Path(temp_dir).iterdir(): + if PROMETHEUS_SERVICE_DISCOVERY_FILE in str(path): + return True + return False + + wait_for_condition(is_service_discovery_exist) + + @pytest.fixture def _setup_cluster_for_test(ray_start_cluster): NUM_NODES = 2 @@ -28,17 +76,11 @@ def _setup_cluster_for_test(ray_start_cluster): worker_should_exit = SignalActor.remote() - # Generate a metric in the driver. - counter = Count("test_driver_counter", description="desc") - counter.record(1) - # Generate some metrics from actor & tasks. @ray.remote def f(): counter = Count("test_counter", description="desc") counter.record(1) - counter = ray.get(ray.put(counter)) # Test serialization. 
- counter.record(1) ray.get(worker_should_exit.wait.remote()) @ray.remote @@ -46,7 +88,6 @@ class A: async def ping(self): histogram = Histogram( "test_histogram", description="desc", boundaries=[0.1, 1.6]) - histogram = ray.get(ray.put(histogram)) # Test serialization. histogram.record(1.5) ray.get(worker_should_exit.wait.remote()) @@ -91,25 +132,19 @@ def test_cases(): for components in components_dict.values()) # Make sure our user defined metrics exist - for metric_name in [ - "test_counter", "test_histogram", "test_driver_counter" - ]: + for metric_name in ["test_counter", "test_histogram"]: assert any(metric_name in full_name for full_name in metric_names) # Make sure GCS server metrics are recorded. assert "ray_outbound_heartbeat_size_kb_sum" in metric_names - # Make sure the numeric values are correct + # Make sure the numeric value is correct test_counter_sample = [ m for m in metric_samples if "test_counter" in m.name ][0] - assert test_counter_sample.value == 2.0 - - test_driver_counter_sample = [ - m for m in metric_samples if "test_driver_counter" in m.name - ][0] - assert test_driver_counter_sample.value == 1.0 + assert test_counter_sample.value == 1.0 + # Make sure the numeric value is correct test_histogram_samples = [ m for m in metric_samples if "test_histogram" in m.name ] @@ -143,58 +178,10 @@ def wrap_test_case_for_retry(): ) except RuntimeError: print( - f"The components are {pformat(fetch_prometheus(prom_addresses))}") + f"The compoenents are {pformat(fetch_prometheus(prom_addresses))}") test_cases() # Should fail assert -def test_prometheus_file_based_service_discovery(ray_start_cluster): - # Make sure Prometheus service discovery file is correctly written - # when number of nodes are dynamically changed. 
- NUM_NODES = 5 - cluster = ray_start_cluster - nodes = [cluster.add_node() for _ in range(NUM_NODES)] - cluster.wait_for_nodes() - addr = ray.init(address=cluster.address) - redis_address = addr["redis_address"] - writer = PrometheusServiceDiscoveryWriter( - redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray") - - def get_metrics_export_address_from_node(nodes): - return [ - "{}:{}".format(node.node_ip_address, node.metrics_export_port) - for node in nodes - ] - - loaded_json_data = json.loads(writer.get_file_discovery_content())[0] - assert (set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"])) - - # Let's update nodes. - for _ in range(3): - nodes.append(cluster.add_node()) - - # Make sure service discovery file content is correctly updated. - loaded_json_data = json.loads(writer.get_file_discovery_content())[0] - assert (set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"])) - - -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -def test_prome_file_discovery_run_by_dashboard(shutdown_only): - ray.init(num_cpus=0) - global_node = ray.worker._global_node - temp_dir = global_node.get_temp_dir_path() - - def is_service_discovery_exist(): - for path in pathlib.Path(temp_dir).iterdir(): - if PROMETHEUS_SERVICE_DISCOVERY_FILE in str(path): - return True - return False - - wait_for_condition(is_service_discovery_exist) - - @pytest.fixture def metric_mock(): mock = MagicMock() diff --git a/python/ray/tests/test_mini.py b/python/ray/tests/test_mini.py index 724deb542aae..dae1e11bd38f 100644 --- a/python/ray/tests/test_mini.py +++ b/python/ray/tests/test_mini.py @@ -59,9 +59,3 @@ def get(self): x = 1 f = Foo.remote(x) assert (ray.get(f.get.remote()) == x) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_monitor.py b/python/ray/tests/test_monitor.py index 
e4b14166d747..ac67ddcf2cdc 100644 --- a/python/ray/tests/test_monitor.py +++ b/python/ray/tests/test_monitor.py @@ -37,9 +37,3 @@ def test_parse_resource_demands(): # counted as infeasible or waiting, as long as it's accounted for and # doesn't cause an error. assert len(waiting + infeasible) == 10 - - -if __name__ == "__main__": - import sys - import pytest - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index 464d985eafe2..fbce475c12af 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -1,13 +1,15 @@ import os import pytest +import subprocess import sys import time import ray -from ray.test_utils import (RayTestTimeoutException, run_string_as_driver, - run_string_as_driver_nonblocking, - wait_for_condition, init_error_pubsub, - get_error_message) +from ray.test_utils import ( + RayTestTimeoutException, check_call_ray, run_string_as_driver, + run_string_as_driver_nonblocking, wait_for_children_of_pid, + wait_for_children_of_pid_to_exit, wait_for_condition, kill_process_by_name, + Semaphore, init_error_pubsub, get_error_message) def test_remote_raylet_cleanup(ray_start_cluster): @@ -178,16 +180,6 @@ def f(): assert "success" in out -@pytest.mark.parametrize( - "call_ray_start", - [ - "ray start --head --num-cpus=1 --min-worker-port=0 " - "--max-worker-port=0 --port 0 --system-config=" - # This test uses ray.objects(), which only works with the GCS-based - # object directory - "{\"ownership_based_object_directory_enabled\":false}", - ], - indirect=True) def test_cleanup_on_driver_exit(call_ray_start): # This test will create a driver that creates a bunch of objects and then # exits. The entries in the object table should be cleaned up. 
@@ -376,6 +368,385 @@ def wait_for_success_output(process_handle, timeout=10): process_handle.kill() +def test_calling_start_ray_head(call_ray_stop_only): + + # Test that we can call ray start with various command line + # parameters. TODO(rkn): This test only tests the --head code path. We + # should also test the non-head node code path. + + # Test starting Ray with a redis port specified. + check_call_ray(["start", "--head", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with a node IP address specified. + check_call_ray( + ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with a system config parameter set. + check_call_ray([ + "start", "--head", "--system-config", + "{\"metrics_report_interval_ms\":100}", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with the object manager and node manager ports + # specified. + check_call_ray([ + "start", "--head", "--object-manager-port", "12345", + "--node-manager-port", "54321", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with the worker port range specified. + check_call_ray([ + "start", "--head", "--min-worker-port", "50000", "--max-worker-port", + "51000", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with a worker port list. + check_call_ray(["start", "--head", "--worker-port-list", "10000,10001"]) + check_call_ray(["stop"]) + + # Test starting Ray with a non-int in the worker port list. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray(["start", "--head", "--worker-port-list", "10000,a"]) + check_call_ray(["stop"]) + + # Test starting Ray with an invalid port in the worker port list. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray(["start", "--head", "--worker-port-list", "100"]) + check_call_ray(["stop"]) + + # Test starting Ray with the number of CPUs specified. 
+ check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with the number of GPUs specified. + check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with redis shard ports specified. + check_call_ray([ + "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port", + "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with all arguments specified. + check_call_ray([ + "start", "--head", "--redis-shard-ports", "6380,6381,6382", + "--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus", "0", + "--resources", "{\"Custom\": 1}", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with invalid arguments. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray( + ["start", "--head", "--address", "127.0.0.1:6379", "--port", "0"]) + check_call_ray(["stop"]) + + # Test --block. Killing a child process should cause the command to exit. + blocked = subprocess.Popen( + ["ray", "start", "--head", "--block", "--port", "0"]) + + wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) + + blocked.poll() + assert blocked.returncode is None + + kill_process_by_name("raylet") + wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) + blocked.wait() + assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" + + # Test --block. Killing the command should clean up all child processes. 
+ blocked = subprocess.Popen( + ["ray", "start", "--head", "--block", "--port", "0"]) + blocked.poll() + assert blocked.returncode is None + + wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) + + blocked.terminate() + wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) + blocked.wait() + assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" + + +@pytest.mark.parametrize( + "call_ray_start", + ["ray start --head --num-cpus=1 " + "--node-ip-address=localhost"], + indirect=True) +def test_using_hostnames(call_ray_start): + ray.init(_node_ip_address="localhost", address="localhost:6379") + + @ray.remote + def f(): + return 1 + + assert ray.get(f.remote()) == 1 + + +def test_connecting_in_local_case(ray_start_regular): + address_info = ray_start_regular + + # Define a driver that just connects to Redis. + driver_script = """ +import ray +ray.init(address="{}") +print("success") +""".format(address_info["redis_address"]) + + out = run_string_as_driver(driver_script) + # Make sure the other driver succeeded. + assert "success" in out + + +def test_run_driver_twice(ray_start_regular): + # We used to have issue 2165 and 2288: + # https://github.com/ray-project/ray/issues/2165 + # https://github.com/ray-project/ray/issues/2288 + # both complain that driver will hang when run for the second time. + # This test is used to verify the fix for above issue, it will run the + # same driver for twice and verify whether both of them succeed. 
+ address_info = ray_start_regular + driver_script = """ +import ray +import ray.tune as tune +import os +import time + +def train_func(config, reporter): # add a reporter arg + for i in range(2): + time.sleep(0.1) + reporter(timesteps_total=i, mean_accuracy=i+97) # report metrics + +os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" +ray.init(address="{}") +ray.tune.register_trainable("train_func", train_func) + +tune.run_experiments({{ + "my_experiment": {{ + "run": "train_func", + "stop": {{"mean_accuracy": 99}}, + "config": {{ + "layer1": {{ + "class_name": tune.grid_search(["a"]), + "config": {{"lr": tune.grid_search([1, 2])}} + }}, + }}, + "local_dir": os.path.expanduser("~/tmp") + }} +}}) +print("success") +""".format(address_info["redis_address"]) + + for i in range(2): + out = run_string_as_driver(driver_script) + assert "success" in out + + +@pytest.mark.skip(reason="fate sharing not implemented yet") +def test_driver_exiting_when_worker_blocked(call_ray_start): + # This test will create some drivers that submit some tasks and then + # exit without waiting for the tasks to complete. + address = call_ray_start + + ray.init(address=address) + + # Define a driver that creates two tasks, one that runs forever and the + # other blocked on the first in a `ray.get`. + driver_script = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def f(): + time.sleep(10**6) +@ray.remote +def g(): + ray.get(f.remote()) +g.remote() +time.sleep(1) +print("success") +""".format(address) + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + out = run_string_as_driver(driver_script) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that creates two tasks, one that runs forever and the + # other blocked on the first in a `ray.wait`. 
+ driver_script = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def f(): + time.sleep(10**6) +@ray.remote +def g(): + ray.wait([f.remote()]) +g.remote() +time.sleep(1) +print("success") +""".format(address) + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + out = run_string_as_driver(driver_script) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that creates one task that depends on a nonexistent + # object. This task will be queued as waiting to execute. + driver_script_template = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def g(x): + return +g.remote(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) +time.sleep(1) +print("success") +""" + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + nonexistent_id = ray.ObjectRef.from_random() + driver_script = driver_script_template.format(address, + nonexistent_id.hex()) + out = run_string_as_driver(driver_script) + # Simulate the nonexistent dependency becoming available. + ray.worker.global_worker.put_object(None, nonexistent_id) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that calls `ray.wait` on a nonexistent object. + driver_script_template = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def g(): + ray.wait(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) +g.remote() +time.sleep(1) +print("success") +""" + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + nonexistent_id = ray.ObjectRef.from_random() + driver_script = driver_script_template.format(address, + nonexistent_id.hex()) + out = run_string_as_driver(driver_script) + # Simulate the nonexistent dependency becoming available. 
+ ray.worker.global_worker.put_object(None, nonexistent_id) + # Make sure the first driver ran to completion. + assert "success" in out + + @ray.remote + def f(): + return 1 + + # Make sure we can still talk with the raylet. + ray.get(f.remote()) + + +def test_multi_driver_logging(ray_start_regular): + address_info = ray_start_regular + address = address_info["redis_address"] + + # ray.init(address=address) + driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0) + driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0) + main_wait = Semaphore.options(name="main_wait").remote(value=0) + + # The creation of an actor is asynchronous. + # We need to wait for the completion of the actor creation, + # otherwise we can't get the actor by name. + ray.get(driver1_wait.locked.remote()) + ray.get(driver2_wait.locked.remote()) + ray.get(main_wait.locked.remote()) + + # Params are address, semaphore name, output1, output2 + driver_script_template = """ +import ray +import sys +from ray.test_utils import Semaphore + +@ray.remote(num_cpus=0) +def remote_print(s, file=None): + print(s, file=file) + +ray.init(address="{}") + +driver_wait = ray.get_actor("{}") +main_wait = ray.get_actor("main_wait") + +ray.get(main_wait.release.remote()) +ray.get(driver_wait.acquire.remote()) + +s1 = "{}" +ray.get(remote_print.remote(s1)) + +ray.get(main_wait.release.remote()) +ray.get(driver_wait.acquire.remote()) + +s2 = "{}" +ray.get(remote_print.remote(s2)) + +ray.get(main_wait.release.remote()) + """ + + p1 = run_string_as_driver_nonblocking( + driver_script_template.format(address, "driver1_wait", "1", "2")) + p2 = run_string_as_driver_nonblocking( + driver_script_template.format(address, "driver2_wait", "3", "4")) + + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + # At this point both of the other drivers are fully initialized. 
+ + ray.get(driver1_wait.release.remote()) + ray.get(driver2_wait.release.remote()) + + # At this point driver1 should receive '1' and driver2 '3' + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + + ray.get(driver1_wait.release.remote()) + ray.get(driver2_wait.release.remote()) + + # At this point driver1 should receive '2' and driver2 '4' + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + + driver1_out = p1.stdout.read().decode("ascii") + driver2_out = p2.stdout.read().decode("ascii") + if sys.platform == "win32": + driver1_out = driver1_out.replace("\r", "") + driver2_out = driver2_out.replace("\r", "") + driver1_out_split = driver1_out.split("\n") + driver2_out_split = driver2_out.split("\n") + + assert driver1_out_split[0][-1] == "1", driver1_out_split + assert driver1_out_split[1][-1] == "2", driver1_out_split + assert driver2_out_split[0][-1] == "3", driver2_out_split + assert driver2_out_split[1][-1] == "4", driver2_out_split + + if __name__ == "__main__": import pytest # Make subprocess happy in bazel. 
diff --git a/python/ray/tests/test_multi_node_2.py b/python/ray/tests/test_multi_node_2.py index 7569dff68113..b3e739e643eb 100644 --- a/python/ray/tests/test_multi_node_2.py +++ b/python/ray/tests/test_multi_node_2.py @@ -4,7 +4,6 @@ import ray import ray.ray_constants as ray_constants -from ray.util.placement_group import placement_group, remove_placement_group from ray.autoscaler.sdk import request_resources from ray.monitor import Monitor from ray.cluster_utils import Cluster @@ -69,45 +68,16 @@ def f(): def setup_monitor(address): monitor = Monitor( address, None, redis_password=ray_constants.REDIS_DEFAULT_PASSWORD) + monitor.update_raylet_map(_append_port=True) return monitor -def assert_correct_pg(pg_response_data, pg_demands, strategy): - assert len(pg_response_data) == 1 - pg_response_data = pg_response_data[0] - strategy_mapping_dict_protobuf = { - "PACK": 0, - "SPREAD": 1, - "STRICT_PACK": 2, - "STRICT_SPREAD": 3 - } - assert pg_response_data.strategy == strategy_mapping_dict_protobuf[ - strategy] - assert pg_response_data.creator_job_id - assert pg_response_data.creator_actor_id - assert pg_response_data.creator_actor_dead - assert pg_response_data.placement_group_id - - for i, bundle in enumerate(pg_demands): - assert pg_response_data.bundles[i].unit_resources == bundle - assert pg_response_data.bundles[i].bundle_id.placement_group_id - - -# DO NOT CHANGE THIS VERIFICATION WITHOUT NOTIFYING (Eric/Ameer/Alex). def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): request_resources(num_cpus=42) - # add placement groups. - pg_demands = [{"GPU": 2}, {"extra_resource": 2}] - strategy = "STRICT_PACK" - pg = placement_group(pg_demands, strategy=strategy) - pg.ready() - time.sleep(2) # wait for placemnt groups to propogate. - # Disable event clearing for test. 
monitor.event_summarizer.clear = lambda *a: None - visited_atleast_once = [set(), set()] while True: monitor.update_load_metrics() monitor.update_resource_requests() @@ -118,29 +88,21 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): req = monitor.load_metrics.resource_requests assert req == [{"CPU": 1}] * 42, req - pg_response_data = monitor.load_metrics.pending_placement_groups - assert_correct_pg(pg_response_data, pg_demands, strategy) - if "memory" in resource_usage[0]: del resource_usage[0]["memory"] - visited_atleast_once[0].add("memory") - if "object_store_memory" in resource_usage[0]: + if "object_store_memory" in resource_usage[1]: del resource_usage[0]["object_store_memory"] - visited_atleast_once[0].add("object_store_memory") if "memory" in resource_usage[1]: del resource_usage[1]["memory"] - visited_atleast_once[1].add("memory") if "object_store_memory" in resource_usage[1]: del resource_usage[1]["object_store_memory"] - visited_atleast_once[1].add("object_store_memory") for key in list(resource_usage[0].keys()): if key.startswith("node:"): del resource_usage[0][key] - visited_atleast_once[0].add("node:") for key in list(resource_usage[1].keys()): if key.startswith("node:"): del resource_usage[1][key] - visited_atleast_once[1].add("node:") + if expected_resource_usage is None: if all(x for x in resource_usage[0:]): break @@ -158,13 +120,6 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): # Sanity check we emitted a resize event. 
assert any("Resized to" in x for x in monitor.event_summarizer.summary()) - assert visited_atleast_once[0] == { - "memory", "object_store_memory", "node:" - } - assert visited_atleast_once[0] == visited_atleast_once[1] - - remove_placement_group(pg) - return resource_usage diff --git a/python/ray/tests/test_multi_node_3.py b/python/ray/tests/test_multi_node_3.py deleted file mode 100644 index 9c270b64da55..000000000000 --- a/python/ray/tests/test_multi_node_3.py +++ /dev/null @@ -1,397 +0,0 @@ -import os -import pytest -import subprocess -import sys - -import ray -from ray.test_utils import ( - check_call_ray, run_string_as_driver, run_string_as_driver_nonblocking, - wait_for_children_of_pid, wait_for_children_of_pid_to_exit, - kill_process_by_name, Semaphore) - - -def test_calling_start_ray_head(call_ray_stop_only): - - # Test that we can call ray start with various command line - # parameters. TODO(rkn): This test only tests the --head code path. We - # should also test the non-head node code path. - - # Test starting Ray with a redis port specified. - check_call_ray(["start", "--head", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with a node IP address specified. - check_call_ray( - ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with a system config parameter set. - check_call_ray([ - "start", "--head", "--system-config", - "{\"metrics_report_interval_ms\":100}", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with the object manager and node manager ports - # specified. - check_call_ray([ - "start", "--head", "--object-manager-port", "12345", - "--node-manager-port", "54321", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with the worker port range specified. 
- check_call_ray([ - "start", "--head", "--min-worker-port", "50000", "--max-worker-port", - "51000", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with a worker port list. - check_call_ray(["start", "--head", "--worker-port-list", "10000,10001"]) - check_call_ray(["stop"]) - - # Test starting Ray with a non-int in the worker port list. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray(["start", "--head", "--worker-port-list", "10000,a"]) - check_call_ray(["stop"]) - - # Test starting Ray with an invalid port in the worker port list. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray(["start", "--head", "--worker-port-list", "100"]) - check_call_ray(["stop"]) - - # Test starting Ray with the number of CPUs specified. - check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with the number of GPUs specified. - check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with redis shard ports specified. - check_call_ray([ - "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port", - "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with all arguments specified. - check_call_ray([ - "start", "--head", "--redis-shard-ports", "6380,6381,6382", - "--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus", "0", - "--resources", "{\"Custom\": 1}", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with invalid arguments. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray( - ["start", "--head", "--address", "127.0.0.1:6379", "--port", "0"]) - check_call_ray(["stop"]) - - # Test --block. Killing a child process should cause the command to exit. 
- blocked = subprocess.Popen( - ["ray", "start", "--head", "--block", "--port", "0"]) - - wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) - - blocked.poll() - assert blocked.returncode is None - - kill_process_by_name("raylet") - wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) - blocked.wait() - assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" - - # Test --block. Killing the command should clean up all child processes. - blocked = subprocess.Popen( - ["ray", "start", "--head", "--block", "--port", "0"]) - blocked.poll() - assert blocked.returncode is None - - wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) - - blocked.terminate() - wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) - blocked.wait() - assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" - - -@pytest.mark.parametrize( - "call_ray_start", - ["ray start --head --num-cpus=1 " + "--node-ip-address=localhost"], - indirect=True) -def test_using_hostnames(call_ray_start): - ray.init(_node_ip_address="localhost", address="localhost:6379") - - @ray.remote - def f(): - return 1 - - assert ray.get(f.remote()) == 1 - - -def test_connecting_in_local_case(ray_start_regular): - address_info = ray_start_regular - - # Define a driver that just connects to Redis. - driver_script = """ -import ray -ray.init(address="{}") -print("success") -""".format(address_info["redis_address"]) - - out = run_string_as_driver(driver_script) - # Make sure the other driver succeeded. - assert "success" in out - - -def test_run_driver_twice(ray_start_regular): - # We used to have issue 2165 and 2288: - # https://github.com/ray-project/ray/issues/2165 - # https://github.com/ray-project/ray/issues/2288 - # both complain that driver will hang when run for the second time. - # This test is used to verify the fix for above issue, it will run the - # same driver for twice and verify whether both of them succeed. 
- address_info = ray_start_regular - driver_script = """ -import ray -import ray.tune as tune -import os -import time - -def train_func(config, reporter): # add a reporter arg - for i in range(2): - time.sleep(0.1) - reporter(timesteps_total=i, mean_accuracy=i+97) # report metrics - -os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" -ray.init(address="{}") -ray.tune.register_trainable("train_func", train_func) - -tune.run_experiments({{ - "my_experiment": {{ - "run": "train_func", - "stop": {{"mean_accuracy": 99}}, - "config": {{ - "layer1": {{ - "class_name": tune.grid_search(["a"]), - "config": {{"lr": tune.grid_search([1, 2])}} - }}, - }}, - "local_dir": os.path.expanduser("~/tmp") - }} -}}) -print("success") -""".format(address_info["redis_address"]) - - for i in range(2): - out = run_string_as_driver(driver_script) - assert "success" in out - - -@pytest.mark.skip(reason="fate sharing not implemented yet") -def test_driver_exiting_when_worker_blocked(call_ray_start): - # This test will create some drivers that submit some tasks and then - # exit without waiting for the tasks to complete. - address = call_ray_start - - ray.init(address=address) - - # Define a driver that creates two tasks, one that runs forever and the - # other blocked on the first in a `ray.get`. - driver_script = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def f(): - time.sleep(10**6) -@ray.remote -def g(): - ray.get(f.remote()) -g.remote() -time.sleep(1) -print("success") -""".format(address) - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - out = run_string_as_driver(driver_script) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that creates two tasks, one that runs forever and the - # other blocked on the first in a `ray.wait`. 
- driver_script = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def f(): - time.sleep(10**6) -@ray.remote -def g(): - ray.wait([f.remote()]) -g.remote() -time.sleep(1) -print("success") -""".format(address) - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - out = run_string_as_driver(driver_script) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that creates one task that depends on a nonexistent - # object. This task will be queued as waiting to execute. - driver_script_template = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def g(x): - return -g.remote(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) -time.sleep(1) -print("success") -""" - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - nonexistent_id = ray.ObjectRef.from_random() - driver_script = driver_script_template.format(address, - nonexistent_id.hex()) - out = run_string_as_driver(driver_script) - # Simulate the nonexistent dependency becoming available. - ray.worker.global_worker.put_object(None, nonexistent_id) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that calls `ray.wait` on a nonexistent object. - driver_script_template = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def g(): - ray.wait(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) -g.remote() -time.sleep(1) -print("success") -""" - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - nonexistent_id = ray.ObjectRef.from_random() - driver_script = driver_script_template.format(address, - nonexistent_id.hex()) - out = run_string_as_driver(driver_script) - # Simulate the nonexistent dependency becoming available. 
- ray.worker.global_worker.put_object(None, nonexistent_id) - # Make sure the first driver ran to completion. - assert "success" in out - - @ray.remote - def f(): - return 1 - - # Make sure we can still talk with the raylet. - ray.get(f.remote()) - - -def test_multi_driver_logging(ray_start_regular): - address_info = ray_start_regular - address = address_info["redis_address"] - - # ray.init(address=address) - driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0) - driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0) - main_wait = Semaphore.options(name="main_wait").remote(value=0) - - # The creation of an actor is asynchronous. - # We need to wait for the completion of the actor creation, - # otherwise we can't get the actor by name. - ray.get(driver1_wait.locked.remote()) - ray.get(driver2_wait.locked.remote()) - ray.get(main_wait.locked.remote()) - - # Params are address, semaphore name, output1, output2 - driver_script_template = """ -import ray -import sys -from ray.test_utils import Semaphore - -@ray.remote(num_cpus=0) -def remote_print(s, file=None): - print(s, file=file) - -ray.init(address="{}") - -driver_wait = ray.get_actor("{}") -main_wait = ray.get_actor("main_wait") - -ray.get(main_wait.release.remote()) -ray.get(driver_wait.acquire.remote()) - -s1 = "{}" -ray.get(remote_print.remote(s1)) - -ray.get(main_wait.release.remote()) -ray.get(driver_wait.acquire.remote()) - -s2 = "{}" -ray.get(remote_print.remote(s2)) - -ray.get(main_wait.release.remote()) - """ - - p1 = run_string_as_driver_nonblocking( - driver_script_template.format(address, "driver1_wait", "1", "2")) - p2 = run_string_as_driver_nonblocking( - driver_script_template.format(address, "driver2_wait", "3", "4")) - - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - # At this point both of the other drivers are fully initialized. 
- - ray.get(driver1_wait.release.remote()) - ray.get(driver2_wait.release.remote()) - - # At this point driver1 should receive '1' and driver2 '3' - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - - ray.get(driver1_wait.release.remote()) - ray.get(driver2_wait.release.remote()) - - # At this point driver1 should receive '2' and driver2 '4' - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - - driver1_out = p1.stdout.read().decode("ascii") - driver2_out = p2.stdout.read().decode("ascii") - if sys.platform == "win32": - driver1_out = driver1_out.replace("\r", "") - driver2_out = driver2_out.replace("\r", "") - driver1_out_split = driver1_out.split("\n") - driver2_out_split = driver2_out.split("\n") - - assert driver1_out_split[0][-1] == "1", driver1_out_split - assert driver1_out_split[1][-1] == "2", driver1_out_split - assert driver2_out_split[0][-1] == "3", driver2_out_split - assert driver2_out_split[1][-1] == "4", driver2_out_split - - -if __name__ == "__main__": - import pytest - # Make subprocess happy in bazel. - os.environ["LC_ALL"] = "en_US.UTF-8" - os.environ["LANG"] = "en_US.UTF-8" - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_multinode_failures_2.py b/python/ray/tests/test_multinode_failures_2.py index dc8e7465c6ed..3dc65be557c1 100644 --- a/python/ray/tests/test_multinode_failures_2.py +++ b/python/ray/tests/test_multinode_failures_2.py @@ -126,9 +126,7 @@ def test_driver_lives_sequential(ray_start_regular): ray.worker._global_node.kill_raylet() ray.worker._global_node.kill_plasma_store() ray.worker._global_node.kill_log_monitor() - if not sys.platform.startswith("win"): - # fails on windows. - ray.worker._global_node.kill_monitor() + ray.worker._global_node.kill_monitor() ray.worker._global_node.kill_gcs_server() # If the driver can reach the tearDown method, then it is still alive. 
diff --git a/python/ray/tests/test_multiprocessing.py b/python/ray/tests/test_multiprocessing.py index 8ec3cb43c7df..3f63b72db19a 100644 --- a/python/ray/tests/test_multiprocessing.py +++ b/python/ray/tests/test_multiprocessing.py @@ -340,7 +340,6 @@ def f(*args): args = [tuple(range(i)) for i in range(100)] assert pool.starmap(f, args) == args - assert pool.starmap(lambda x, y: x + y, zip([1, 2], [3, 4])) == [4, 6] def test_callbacks(pool_4_processes): diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index 004b1c2f6a5d..b29b9caa228f 100644 --- a/python/ray/tests/test_object_manager.py +++ b/python/ray/tests/test_object_manager.py @@ -296,85 +296,6 @@ def driver(): ray.get(driver.remote()) -@pytest.mark.timeout(30) -def test_pull_bundles_admission_control(shutdown_only): - cluster = Cluster() - object_size = int(6e6) - num_objects = 10 - num_tasks = 10 - # Head node can fit all of the objects at once. - cluster.add_node( - num_cpus=0, - object_store_memory=2 * num_tasks * num_objects * object_size) - cluster.wait_for_nodes() - ray.init(address=cluster.address) - - # Worker node can only fit 1 task at a time. - cluster.add_node( - num_cpus=1, object_store_memory=1.5 * num_objects * object_size) - cluster.wait_for_nodes() - - @ray.remote - def foo(*args): - return - - args = [] - for _ in range(num_tasks): - task_args = [ - ray.put(np.zeros(object_size, dtype=np.uint8)) - for _ in range(num_objects) - ] - args.append(task_args) - - tasks = [foo.remote(*task_args) for task_args in args] - ray.get(tasks) - - -@pytest.mark.timeout(30) -def test_pull_bundles_admission_control_dynamic(shutdown_only): - # This test is the same as test_pull_bundles_admission_control, except that - # the object store's capacity starts off higher and is later consumed - # dynamically by concurrent workers. - cluster = Cluster() - object_size = int(6e6) - num_objects = 10 - num_tasks = 10 - # Head node can fit all of the objects at once. 
- cluster.add_node( - num_cpus=0, - object_store_memory=2 * num_tasks * num_objects * object_size) - cluster.wait_for_nodes() - ray.init(address=cluster.address) - - # Worker node can fit 2 tasks at a time. - cluster.add_node( - num_cpus=1, object_store_memory=2.5 * num_objects * object_size) - cluster.wait_for_nodes() - - @ray.remote - def foo(i, *args): - print("foo", i) - return - - @ray.remote - def allocate(i): - print("allocate", i) - return np.zeros(object_size, dtype=np.uint8) - - args = [] - for _ in range(num_tasks): - task_args = [ - ray.put(np.zeros(object_size, dtype=np.uint8)) - for _ in range(num_objects) - ] - args.append(task_args) - - tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)] - allocated = [allocate.remote(i) for i in range(num_objects)] - ray.get(tasks) - del allocated - - if __name__ == "__main__": import pytest import sys diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index e0e3033d255a..10b1da77306a 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -3,7 +3,6 @@ import os import random import platform -import subprocess import sys import numpy as np @@ -11,7 +10,7 @@ import ray from ray.external_storage import (create_url_with_offset, parse_url_with_offset) -from ray.test_utils import wait_for_condition, run_string_as_driver +from ray.test_utils import wait_for_condition from ray.internal.internal_api import memory_summary bucket_name = "object-spilling-test" @@ -22,15 +21,6 @@ "directory_path": spill_local_path } } -# Since we have differet protocol for a local external storage (e.g., fs) -# and distributed external storage (e.g., S3), we need to test both cases. -# This mocks the distributed fs with cluster utils. 
-mock_distributed_fs_object_spilling_config = { - "type": "mock_distributed_fs", - "params": { - "directory_path": spill_local_path - } -} smart_open_object_spilling_config = { "type": "smart_open", "params": { @@ -39,15 +29,6 @@ } -def create_object_spilling_config(request, tmp_path): - if (request.param["type"] == "filesystem" - or request.param["type"] == "mock_distributed_fs"): - temp_folder = tmp_path / "spill" - temp_folder.mkdir() - request.param["params"]["directory_path"] = str(temp_folder) - return json.dumps(request.param), temp_folder - - @pytest.fixture( scope="function", params=[ @@ -55,58 +36,10 @@ def create_object_spilling_config(request, tmp_path): # TODO(sang): Add a mock dependency to test S3. # smart_open_object_spilling_config, ]) -def object_spilling_config(request, tmp_path): - yield create_object_spilling_config(request, tmp_path) - - -@pytest.fixture( - scope="function", - params=[ - file_system_object_spilling_config, - mock_distributed_fs_object_spilling_config - ]) -def multi_node_object_spilling_config(request, tmp_path): - yield create_object_spilling_config(request, tmp_path) - - -def run_basic_workload(): - """Run the workload that requires spilling.""" - arr = np.random.rand(5 * 1024 * 1024) # 40 MB - refs = [] - refs.append([ray.put(arr) for _ in range(2)]) - ray.get(ray.put(arr)) - - -def is_dir_empty(temp_folder, - append_path=ray.ray_constants.DEFAULT_OBJECT_PREFIX): - # append_path is used because the file based spilling will append - # new directory path. 
- num_files = 0 - temp_folder = temp_folder / append_path - for path in temp_folder.iterdir(): - num_files += 1 - return num_files == 0 - - -def assert_no_thrashing(address): - state = ray.state.GlobalState() - state._initialize_global_state(address, - ray.ray_constants.REDIS_DEFAULT_PASSWORD) - raylet = state.node_table()[0] - memory_summary = ray.internal.internal_api.memory_summary( - raylet["NodeManagerAddress"], - raylet["NodeManagerPort"], - stats_only=True) - restored_bytes = 0 - consumed_bytes = 0 - - for line in memory_summary.split("\n"): - if "Restored" in line: - restored_bytes = int(line.split(" ")[1]) - if "consumed" in line: - consumed_bytes = int(line.split(" ")[-2]) - assert consumed_bytes >= restored_bytes, ( - f"consumed: {consumed_bytes}, restored: {restored_bytes}") +def object_spilling_config(request, tmpdir): + if request.param["type"] == "filesystem": + request.param["params"]["directory_path"] = str(tmpdir) + yield json.dumps(request.param) def test_invalid_config_raises_exception(shutdown_only): @@ -142,79 +75,22 @@ def test_url_generation_and_parse(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_default_config(shutdown_only): - ray.init(num_cpus=0, object_store_memory=75 * 1024 * 1024) - # Make sure the object spilling configuration is properly set. - config = json.loads( - ray.worker._global_node._config["object_spilling_config"]) - assert config["type"] == "filesystem" - assert (config["params"]["directory_path"] == - ray.worker._global_node._session_dir) - # Make sure the basic workload can succeed. - run_basic_workload() - ray.shutdown() - - # Make sure config is not initalized if spilling is not enabled.. 
- ray.init( - num_cpus=0, - object_store_memory=75 * 1024 * 1024, - _system_config={ - "automatic_object_spilling_enabled": False, - "object_store_full_delay_ms": 100 - }) - assert "object_spilling_config" not in ray.worker._global_node._config - with pytest.raises(ray.exceptions.ObjectStoreFullError): - run_basic_workload() - ray.shutdown() - - # Make sure when we use a different config, it is reflected. - ray.init( - num_cpus=0, - _system_config={ - "object_spilling_config": ( - json.dumps(mock_distributed_fs_object_spilling_config)) - }) - config = json.loads( - ray.worker._global_node._config["object_spilling_config"]) - assert config["type"] == "mock_distributed_fs" - - -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -def test_default_config_cluster(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=0) - ray.init(cluster.address) - worker_nodes = [] - worker_nodes.append( - cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)) - cluster.wait_for_nodes() - - # Run the basic spilling workload on both - # worker nodes and make sure they are working. - @ray.remote - def task(): - arr = np.random.rand(5 * 1024 * 1024) # 40 MB - refs = [] - refs.append([ray.put(arr) for _ in range(2)]) - ray.get(ray.put(arr)) - - ray.get([task.remote() for _ in range(2)]) - - -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -def test_spilling_not_done_for_pinned_object(object_spilling_config, - shutdown_only): +def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = object_spilling_config - address = ray.init( + temp_folder = tmp_path / "spill" + temp_folder.mkdir() + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), "min_spilling_size": 0, }) arr = np.random.rand(5 * 1024 * 1024) # 40 MB @@ -223,29 +99,38 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config, with pytest.raises(ray.exceptions.ObjectStoreFullError): ref2 = ray.put(arr) # noqa - wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + + wait_for_condition(is_dir_empty) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_spill_remote_object(ray_start_cluster, - multi_node_object_spilling_config): - cluster = ray_start_cluster - object_spilling_config, _ = multi_node_object_spilling_config - cluster.add_node( - num_cpus=0, - object_store_memory=75 * 1024 * 1024, - _system_config={ +@pytest.mark.parametrize( + "ray_start_cluster_head", [{ + "num_cpus": 0, + "object_store_memory": 75 * 1024 * 1024, + "_system_config": { "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, "max_io_workers": 4, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": "/tmp" + } + }), "min_spilling_size": 0, - }) - ray.init(address=cluster.address) + }, + }], + indirect=True) +def test_spill_remote_object(ray_start_cluster_head): + cluster = ray_start_cluster_head cluster.add_node(object_store_memory=75 * 1024 * 1024) - 
cluster.wait_for_nodes() @ray.remote def put(): @@ -271,15 +156,13 @@ def depends(arg): # Test passing the spilled object as an arg to another task. ray.get(depends.remote(ref)) - assert_no_thrashing(cluster.address) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_objects_automatically(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. - object_spilling_config, _ = object_spilling_config - address = ray.init( + ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, _system_config={ @@ -302,7 +185,7 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): ref = ray.put(arr) replay_buffer.append(ref) solution_buffer.append(arr) - print("spill done.") + # randomly sample objects for _ in range(1000): index = random.choice(list(range(buffer_length))) @@ -310,22 +193,29 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): solution = solution_buffer[index] sample = ray.get(ref, timeout=0) assert np.array_equal(sample, solution) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") -def test_spill_stats(object_spilling_config, shutdown_only): + platform.system() == "Windows", reason="Failing on Windows.") +def test_spill_stats(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, _ = object_spilling_config - address = ray.init( + temp_folder = tmp_path / "spill" + temp_folder.mkdir() + ray.init( num_cpus=1, object_store_memory=100 * 1024 * 1024, _system_config={ "automatic_object_spilling_enabled": True, "max_io_workers": 100, "min_spilling_size": 1, - "object_spilling_config": object_spilling_config + "object_spilling_config": json.dumps( + { + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }, + separators=(",", ":")) }, ) @@ -343,31 +233,16 @@ def f(): x_id = f.remote() # noqa ray.get(x_id) - s = memory_summary(stats_only=True) + s = memory_summary() assert "Plasma memory usage 50 MiB, 1 objects, 50.0% full" in s, s assert "Spilled 200 MiB, 4 objects" in s, s assert "Restored 150 MiB, 3 objects" in s, s - # Test if consumed bytes are correctly calculated. - obj = ray.put(np.zeros(30 * 1024 * 1024, dtype=np.uint8)) - - @ray.remote - def func_with_ref(obj): - return True - - ray.get(func_with_ref.remote(obj)) - - s = memory_summary(stats_only=True) - # 50MB * 5 references + 30MB used for task execution. - assert "Objects consumed by Ray tasks: 280 MiB." in s, s - assert_no_thrashing(address["redis_address"]) - @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_during_get(object_spilling_config, shutdown_only): - object_spilling_config, _ = object_spilling_config - address = ray.init( + ray.init( num_cpus=4, object_store_memory=100 * 1024 * 1024, _system_config={ @@ -393,15 +268,13 @@ def f(): # objects are being created. for x in ids: print(ray.get(x).shape) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_deadlock(object_spilling_config, shutdown_only): - object_spilling_config, _ = object_spilling_config # Limit our object store to 75 MiB of memory. 
- address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 1, @@ -425,23 +298,27 @@ def test_spill_deadlock(object_spilling_config, shutdown_only): ref = random.choice(replay_buffer) sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_delete_objects(object_spilling_config, shutdown_only): +def test_delete_objects(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. - object_spilling_config, temp_folder = object_spilling_config - - address = ray.init( + temp_folder = tmp_path / "spill" + temp_folder.mkdir() + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 1, "min_spilling_size": 0, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) arr = np.random.rand(1024 * 1024) # 8 MB data replay_buffer = [] @@ -454,27 +331,36 @@ def test_delete_objects(object_spilling_config, shutdown_only): print("-----------------------------------") + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + del replay_buffer del ref - wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) + wait_for_condition(is_dir_empty) @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") -def test_delete_objects_delete_while_creating(object_spilling_config, - shutdown_only): + platform.system() == "Windows", reason="Failing on Windows.") +def test_delete_objects_delete_while_creating(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = object_spilling_config - - address = ray.init( + temp_folder = tmp_path / "spill" + temp_folder.mkdir() + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "min_spilling_size": 0, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) arr = np.random.rand(1024 * 1024) # 8 MB data replay_buffer = [] @@ -494,27 +380,36 @@ def test_delete_objects_delete_while_creating(object_spilling_config, sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + # After all, make sure all objects are killed without race condition. del replay_buffer del ref - wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) + wait_for_condition(is_dir_empty, timeout=1000) @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") -def test_delete_objects_on_worker_failure(object_spilling_config, - shutdown_only): + platform.system() == "Windows", reason="Failing on Windows.") +def test_delete_objects_on_worker_failure(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = object_spilling_config - - address = ray.init( + temp_folder = tmp_path / "spill" + temp_folder.mkdir() + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), "min_spilling_size": 0, }) @@ -558,19 +453,22 @@ def wait_until_actor_dead(): wait_for_condition(wait_until_actor_dead) + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + # After all, make sure all objects are deleted upon worker failures. - wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(address["redis_address"]) + wait_for_condition(is_dir_empty, timeout=1000) @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], - reason="Failing on Windows and MacOS.") -def test_delete_objects_multi_node(multi_node_object_spilling_config, - ray_start_cluster): + platform.system() == "Windows", reason="Failing on Windows.") +def test_delete_objects_multi_node(tmp_path, ray_start_cluster): # Limit our object store to 75 MiB of memory. - object_spilling_config, temp_folder = multi_node_object_spilling_config - + temp_folder = tmp_path / "spill" + temp_folder.mkdir() cluster = ray_start_cluster # Head node. cluster.add_node( @@ -581,13 +479,17 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config, "min_spilling_size": 20 * 1024 * 1024, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) - ray.init(address=cluster.address) # Add 2 worker nodes. 
for _ in range(2): cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024) - cluster.wait_for_nodes() + ray.init(address=cluster.address) arr = np.random.rand(1024 * 1024) # 8 MB data @@ -610,9 +512,9 @@ def create_objects(self): self.replay_buffer.pop() # Do random sampling. - for _ in range(50): + for _ in range(200): ref = random.choice(self.replay_buffer) - sample = ray.get(ref, timeout=10) + sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) actors = [Actor.remote() for _ in range(3)] @@ -625,27 +527,37 @@ def wait_until_actor_dead(actor): return True return False + def is_dir_empty(): + num_files = 0 + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + # Kill actors to remove all references. for actor in actors: ray.kill(actor) wait_for_condition(lambda: wait_until_actor_dead(actor)) # The multi node deletion should work. - wait_for_condition(lambda: is_dir_empty(temp_folder)) - assert_no_thrashing(cluster.address) + wait_for_condition(is_dir_empty) -@pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") -def test_fusion_objects(object_spilling_config, shutdown_only): +def test_fusion_objects(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
- object_spilling_config, temp_folder = object_spilling_config + temp_folder = tmp_path / "spill" + temp_folder.mkdir() min_spilling_size = 10 * 1024 * 1024 - address = ray.init( + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 3, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), "min_spilling_size": min_spilling_size, }) replay_buffer = [] @@ -672,9 +584,6 @@ def test_fusion_objects(object_spilling_config, shutdown_only): assert np.array_equal(sample, solution) is_test_passing = False - # Since we'd like to see the temp directory that stores the files, - # we need to append this directory. - temp_folder = temp_folder / ray.ray_constants.DEFAULT_OBJECT_PREFIX for path in temp_folder.iterdir(): file_size = path.stat().st_size # Make sure there are at least one @@ -683,20 +592,24 @@ def test_fusion_objects(object_spilling_config, shutdown_only): if file_size >= min_spilling_size: is_test_passing = True assert is_test_passing - assert_no_thrashing(address["redis_address"]) # https://github.com/ray-project/ray/issues/12912 -def do_test_release_resource(object_spilling_config, expect_released): - object_spilling_config, temp_folder = object_spilling_config - address = ray.init( +def do_test_release_resource(tmp_path, expect_released): + temp_folder = tmp_path / "spill" + ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 1, "release_resources_during_plasma_fetch": expect_released, "automatic_object_spilling_enabled": True, - "object_spilling_config": object_spilling_config, + "object_spilling_config": json.dumps({ + "type": "filesystem", + "params": { + "directory_path": str(temp_folder) + } + }), }) plasma_obj = ray.put(np.ones(50 * 1024 * 1024, dtype=np.uint8)) for _ in range(5): @@ 
-721,134 +634,18 @@ def f(dep): assert ready else: assert not ready - assert_no_thrashing(address["redis_address"]) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_no_release_during_plasma_fetch(object_spilling_config, shutdown_only): - do_test_release_resource(object_spilling_config, expect_released=False) +def test_no_release_during_plasma_fetch(tmp_path, shutdown_only): + do_test_release_resource(tmp_path, expect_released=False) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_release_during_plasma_fetch(object_spilling_config, shutdown_only): - do_test_release_resource(object_spilling_config, expect_released=True) - - -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -@pytest.mark.timeout(30) -def test_spill_objects_on_object_transfer(object_spilling_config, - ray_start_cluster): - object_spilling_config, _ = object_spilling_config - # This test checks that objects get spilled to make room for transferred - # objects. - cluster = ray_start_cluster - object_size = int(1e7) - num_objects = 10 - num_tasks = 10 - # Head node can fit all of the objects at once. - cluster.add_node( - num_cpus=0, - object_store_memory=2 * num_tasks * num_objects * object_size, - _system_config={ - "max_io_workers": 1, - "automatic_object_spilling_enabled": True, - "object_store_full_delay_ms": 100, - "object_spilling_config": object_spilling_config, - "min_spilling_size": 0 - }) - cluster.wait_for_nodes() - ray.init(address=cluster.address) - - # Worker node can fit 1 tasks at a time. - cluster.add_node( - num_cpus=1, object_store_memory=1.5 * num_objects * object_size) - cluster.wait_for_nodes() - - @ray.remote - def foo(*args): - return - - @ray.remote - def allocate(*args): - return np.zeros(object_size, dtype=np.uint8) - - # Allocate some objects that must be spilled to make room for foo's - # arguments. 
- allocated = [allocate.remote() for _ in range(num_objects)] - ray.get(allocated) - print("done allocating") - - args = [] - for _ in range(num_tasks): - task_args = [ - ray.put(np.zeros(object_size, dtype=np.uint8)) - for _ in range(num_objects) - ] - args.append(task_args) - - # Check that tasks scheduled to the worker node have enough room after - # spilling. - tasks = [foo.remote(*task_args) for task_args in args] - ray.get(tasks) - assert_no_thrashing(cluster.address) - - -@pytest.mark.skipif( - platform.system() in ["Windows"], reason="Failing on " - "Windows and Mac.") -def test_file_deleted_when_driver_exits(tmp_path, shutdown_only): - # Limit our object store to 75 MiB of memory. - temp_folder = tmp_path / "spill" - temp_folder.mkdir() - - driver = """ -import json -import os -import signal -import numpy as np -import ray -ray.init( - object_store_memory=75 * 1024 * 1024, - _system_config={{ - "max_io_workers": 2, - "min_spilling_size": 0, - "automatic_object_spilling_enabled": True, - "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({{ - "type": "filesystem", - "params": {{ - "directory_path": "{temp_dir}" - }} - }}), - }}) -arr = np.random.rand(1024 * 1024) # 8 MB data -replay_buffer = [] -# Spill lots of objects -for _ in range(30): - ref = None - while ref is None: - ref = ray.put(arr) - replay_buffer.append(ref) -# Send sigterm to itself. -signum = {signum} -sig = None -if signum == 2: - sig = signal.SIGINT -elif signum == 15: - sig = signal.SIGTERM -os.kill(os.getpid(), sig) -""" - - # Run a driver with sigint. 
- print("Sending sigint...") - with pytest.raises(subprocess.CalledProcessError): - print( - run_string_as_driver( - driver.format(temp_dir=str(temp_folder), signum=2))) - wait_for_condition(lambda: is_dir_empty(temp_folder, append_path="")) +def test_release_during_plasma_fetch(tmp_path, shutdown_only): + do_test_release_resource(tmp_path, expect_released=True) if __name__ == "__main__": diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 92ef90ca4e1e..7c5963f9e8a1 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -375,7 +375,6 @@ def test_remove_pending_placement_group(ray_start_cluster): # Create a placement group that cannot be scheduled now. placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}]) ray.util.remove_placement_group(placement_group) - # TODO(sang): Add state check here. @ray.remote(num_cpus=4) def f(): @@ -798,10 +797,10 @@ def random_tasks(): pg_tasks = [] # total bundle gpu usage = bundles_per_pg * total_num_pg * per_bundle_gpus # Note this is half of total - for index in range(total_num_pg): + for _ in range(total_num_pg): pgs.append( ray.util.placement_group( - name=f"name{index}", + name="name", strategy="PACK", bundles=[{ "GPU": per_bundle_gpus @@ -902,10 +901,8 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # Now create an actor, but do not capture the current tasks a = Actor.options( @@ -927,10 +924,8 @@ def schedule_nested_actor_outside_pg(self): # Kill an actor and wait until it is killed. ray.kill(a) - try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass # Lastly, make sure when None is specified, actors are not scheduled # on the same placement group. 
@@ -1314,202 +1309,6 @@ def is_all_placement_group_removed(): wait_for_condition(is_all_placement_group_removed) - ray.shutdown() - - -def test_detached_placement_group(ray_start_cluster): - cluster = ray_start_cluster - for _ in range(2): - cluster.add_node(num_cpus=3) - cluster.wait_for_nodes() - info = ray.init(address=cluster.address) - - # Make sure detached placement group will alive when job dead. - driver_code = f""" -import ray - -ray.init(address="{info["redis_address"]}") - -pg = ray.util.placement_group( - [{{"CPU": 1}} for _ in range(2)], - strategy="STRICT_SPREAD", lifetime="detached") -ray.get(pg.ready()) - -@ray.remote(num_cpus=1) -class Actor: - def ready(self): - return True - -for bundle_index in range(2): - actor = Actor.options(lifetime="detached", placement_group=pg, - placement_group_bundle_index=bundle_index).remote() - ray.get(actor.ready.remote()) - -ray.shutdown() - """ - - run_string_as_driver(driver_code) - - # Wait until the driver is reported as dead by GCS. - def is_job_done(): - jobs = ray.jobs() - for job in jobs: - if "StopTime" in job: - return True - return False - - def assert_alive_num_pg(expected_num_pg): - alive_num_pg = 0 - for _, placement_group_info in ray.util.placement_group_table().items( - ): - if placement_group_info["state"] == "CREATED": - alive_num_pg += 1 - return alive_num_pg == expected_num_pg - - def assert_alive_num_actor(expected_num_actor): - alive_num_actor = 0 - for actor_info in ray.actors().values(): - if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE: - alive_num_actor += 1 - return alive_num_actor == expected_num_actor - - wait_for_condition(is_job_done) - - assert assert_alive_num_pg(1) - assert assert_alive_num_actor(2) - - # Make sure detached placement group will alive when its creator which - # is detached actor dead. - # Test actors first. 
- @ray.remote(num_cpus=1) - class NestedActor: - def ready(self): - return True - - @ray.remote(num_cpus=1) - class Actor: - def __init__(self): - self.actors = [] - - def ready(self): - return True - - def schedule_nested_actor_with_detached_pg(self): - # Create placement group which is detached. - pg = ray.util.placement_group( - [{ - "CPU": 1 - } for _ in range(2)], - strategy="STRICT_SPREAD", - lifetime="detached", - name="detached_pg") - ray.get(pg.ready()) - # Schedule nested actor with the placement group. - for bundle_index in range(2): - actor = NestedActor.options( - placement_group=pg, - placement_group_bundle_index=bundle_index, - lifetime="detached").remote() - ray.get(actor.ready.remote()) - self.actors.append(actor) - - a = Actor.options(lifetime="detached").remote() - ray.get(a.ready.remote()) - # 1 parent actor and 2 children actor. - ray.get(a.schedule_nested_actor_with_detached_pg.remote()) - - # Kill an actor and wait until it is killed. - ray.kill(a) - try: - ray.get(a.ready.remote()) - except ray.exceptions.RayActorError: - pass - - # We should have 2 alive pgs and 4 alive actors. - assert assert_alive_num_pg(2) - assert assert_alive_num_actor(4) - - -def test_named_placement_group(ray_start_cluster): - cluster = ray_start_cluster - for _ in range(2): - cluster.add_node(num_cpus=3) - cluster.wait_for_nodes() - info = ray.init(address=cluster.address) - global_placement_group_name = "named_placement_group" - - # Create a detached placement group with name. - driver_code = f""" -import ray - -ray.init(address="{info["redis_address"]}") - -pg = ray.util.placement_group( - [{{"CPU": 1}} for _ in range(2)], - strategy="STRICT_SPREAD", - name="{global_placement_group_name}", - lifetime="detached") -ray.get(pg.ready()) - -ray.shutdown() - """ - - run_string_as_driver(driver_code) - - # Wait until the driver is reported as dead by GCS. 
- def is_job_done(): - jobs = ray.jobs() - for job in jobs: - if "StopTime" in job: - return True - return False - - wait_for_condition(is_job_done) - - @ray.remote(num_cpus=1) - class Actor: - def ping(self): - return "pong" - - # Get the named placement group and schedule a actor. - placement_group = ray.util.get_placement_group(global_placement_group_name) - assert placement_group is not None - assert placement_group.wait(5) - actor = Actor.options( - placement_group=placement_group, - placement_group_bundle_index=0).remote() - - ray.get(actor.ping.remote()) - - # Create another placement group and make sure its creation will failed. - same_name_pg = ray.util.placement_group( - [{ - "CPU": 1 - } for _ in range(2)], - strategy="STRICT_SPREAD", - name=global_placement_group_name) - assert not same_name_pg.wait(10) - - # Remove a named placement group and make sure the second creation - # will successful. - ray.util.remove_placement_group(placement_group) - same_name_pg = ray.util.placement_group( - [{ - "CPU": 1 - } for _ in range(2)], - strategy="STRICT_SPREAD", - name=global_placement_group_name) - assert same_name_pg.wait(10) - - # Get a named placement group with a name that doesn't exist - # and make sure it will raise ValueError correctly. - error_count = 0 - try: - ray.util.get_placement_group("inexistent_pg") - except ValueError: - error_count = error_count + 1 - assert error_count == 1 - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_queue.py b/python/ray/tests/test_queue.py index 88cf6d7b647f..6c2fb5cf0ec9 100644 --- a/python/ray/tests/test_queue.py +++ b/python/ray/tests/test_queue.py @@ -199,19 +199,17 @@ def test_custom_resources(ray_start_regular_shared): assert current_resources["CPU"] == 1.0 # By default an actor should not reserve any resources. 
- q = Queue() + Queue() current_resources = ray.available_resources() assert current_resources["CPU"] == 1.0 - q.shutdown() # Specify resource requirement. The queue should now reserve 1 CPU. - q = Queue(actor_options={"num_cpus": 1}) + Queue(actor_options={"num_cpus": 1}) def no_cpu_in_resources(): return "CPU" not in ray.available_resources() wait_for_condition(no_cpu_in_resources) - q.shutdown() if __name__ == "__main__": diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index bad48419f58e..f5eed1e8fb23 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -17,7 +17,7 @@ def test_cached_object(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } cluster = ray_start_cluster @@ -59,7 +59,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -118,7 +118,7 @@ def dependent_task(x): def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. 
@@ -163,12 +163,11 @@ def dependent_task(x): raise e.as_instanceof_cause() -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -220,13 +219,12 @@ def dependent_task(x): pass -@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_basic_reconstruction_actor_task(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -299,7 +297,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -374,12 +372,11 @@ def probe(): raise e.as_instanceof_cause() -@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. 
@@ -439,12 +436,11 @@ def dependent_task(x): raise e.as_instanceof_cause() -@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -491,12 +487,11 @@ def dependent_task(x): raise e.as_instanceof_cause() -@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_reconstruction_stress(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_period_milliseconds": 100, + "raylet_heartbeat_timeout_milliseconds": 100, "max_direct_call_object_size": 100, "task_retry_delay_ms": 100, "object_timeout_milliseconds": 200, diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 0c0f3010af13..a47a9a828c11 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -18,10 +18,8 @@ @pytest.fixture def one_worker_100MiB(request): - # It has lots of tests that don't require object spilling. config = { "task_retry_delay_ms": 0, - "automatic_object_spilling_enabled": False } yield ray.init( num_cpus=1, @@ -245,7 +243,9 @@ def pending(input1, input2): def test_feature_flag(shutdown_only): - ray.init(object_store_memory=100 * 1024 * 1024) + ray.init( + object_store_memory=100 * 1024 * 1024, + _system_config={"object_pinning_enabled": 0}) @ray.remote def f(array): @@ -468,10 +468,8 @@ def delete_ref2(self): # Test that the actor exiting stops the reference from being pinned. ray.kill(actor) # Wait for the actor to exit. 
- try: + with pytest.raises(ray.exceptions.RayActorError): ray.get(actor.delete_ref1.remote()) - except ray.exceptions.RayActorError: - pass else: # Test that deleting the second reference stops it from being pinned. ray.get(actor.delete_ref2.remote()) diff --git a/python/ray/tests/test_reference_counting_2.py b/python/ray/tests/test_reference_counting_2.py index 416afcec0378..8cc7576aa46c 100644 --- a/python/ray/tests/test_reference_counting_2.py +++ b/python/ray/tests/test_reference_counting_2.py @@ -22,7 +22,6 @@ def one_worker_100MiB(request): config = { "task_retry_delay_ms": 0, "object_timeout_milliseconds": 1000, - "automatic_object_spilling_enabled": False } yield ray.init( num_cpus=1, diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index d753ffcab35a..4b2027af1d66 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -8,7 +8,6 @@ import copy import ray -import ray.ray_constants from ray.autoscaler._private.util import \ rewrite_legacy_yaml_to_available_node_types, format_info_string, \ format_info_string_no_node_types @@ -29,7 +28,7 @@ from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE, TAG_RAY_NODE_KIND, \ NODE_KIND_WORKER, TAG_RAY_NODE_STATUS, \ STATUS_UP_TO_DATE, STATUS_UNINITIALIZED, \ - STATUS_UPDATE_FAILED, STATUS_WAITING_FOR_SSH, \ + STATUS_UPDATE_FAILED, \ NODE_KIND_HEAD, NODE_TYPE_LEGACY_WORKER, \ NODE_TYPE_LEGACY_HEAD from ray.test_utils import same_elements @@ -88,7 +87,8 @@ MULTI_WORKER_CLUSTER = dict( SMALL_CLUSTER, **{ "available_node_types": TYPES_A, - "head_node_type": "empty_node" + "head_node_type": "empty_node", + "worker_default_node_type": "m4.large", }) @@ -106,14 +106,6 @@ def test_util_score(): (8, 8) -def test_gpu_node_util_score(): - # Avoid scheduling CPU tasks on GPU node. 
- assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1}]) is None - assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1, "GPU": 1}]) \ - == (1.0, 1.0) - assert _utilization_score({"GPU": 1, "CPU": 1}, [{"GPU": 1}]) == (0.0, 0.5) - - def test_bin_pack(): assert get_bin_pack_residual([], [{"GPU": 2}, {"GPU": 2}])[0] == \ [{"GPU": 2}, {"GPU": 2}] @@ -256,32 +248,6 @@ def test_get_nodes_packing_heuristic(): } -def test_gpu_node_avoid_cpu_task(): - types = { - "cpu": { - "resources": { - "CPU": 1 - }, - "max_workers": 10, - }, - "gpu": { - "resources": { - "GPU": 1, - "CPU": 100, - }, - "max_workers": 10, - }, - } - r1 = [{"CPU": 1}] * 100 - assert get_nodes_for(types, {}, "empty_node", 100, r1) == {"cpu": 10} - r2 = [{"GPU": 1}] + [{"CPU": 1}] * 100 - assert get_nodes_for(types, {}, "empty_node", 100, r2) == \ - {"gpu": 1} - r3 = [{"GPU": 1}] * 4 + [{"CPU": 1}] * 404 - assert get_nodes_for(types, {}, "empty_node", 100, r3) == \ - {"gpu": 4, "cpu": 4} - - def test_get_nodes_respects_max_limit(): types = { "m4.large": { @@ -1216,27 +1182,15 @@ def testSummary(self): strategy=PlacementStrategy.PACK, bundles=([Bundle(unit_resources={"GPU": 2})] * 2)), ] - lm.update( - "1.1.1.1", - { - "CPU": 64, - "memory": 20, # 1000 MiB - "object_store_memory": 40 # 2000 MiB - }, - { - "CPU": 2, - "memory": 10, # 500 MiB - "object_store_memory": 20 # 1000 MiB - }, - {}) + lm.update("1.1.1.1", {"CPU": 64}, {"CPU": 2}, {}) lm.update("1.1.1.2", { "CPU": 64, "GPU": 8, - "accelerator_type:V100": 1, + "accelerator_type:V100": 1 }, { "CPU": 0, "GPU": 1, - "accelerator_type:V100": 1, + "accelerator_type:V100": 1 }, {}) lm.update("1.1.1.3", { "CPU": 64, @@ -1270,9 +1224,6 @@ def testSummary(self): assert summary.usage["CPU"] == (190, 194) assert summary.usage["GPU"] == (15, 16) - assert summary.usage["memory"] == (500 * 2**20, 1000 * 2**20) - assert summary.usage["object_store_memory"] == \ - (1000 * 2**20, 2000 * 2**20) assert summary.usage["accelerator_type:V100"][1] == 2, \ "Not 
comparing the usage value due to floating point error." @@ -1296,7 +1247,7 @@ def testSummary(self): # TODO (Alex): This set of nodes won't be very useful in practice # because the node:xxx.xxx.xxx.xxx resources means that no 2 nodes # should ever have the same set of resources. - assert len(summary.node_types) == 3, summary.node_types + assert len(summary.node_types) == 3 class AutoscalingTest(unittest.TestCase): @@ -1468,8 +1419,7 @@ def testSummary(self): assert summary.active_nodes["empty_node"] == 1 assert len(summary.active_nodes) == 2, summary.active_nodes - assert summary.pending_nodes == [("172.0.0.3", "p2.xlarge", - STATUS_WAITING_FOR_SSH)] + assert summary.pending_nodes == [("172.0.0.3", "p2.xlarge")] assert summary.pending_launches == {"m4.16xlarge": 2} assert summary.failed_nodes == [("172.0.0.4", "m4.4xlarge")] @@ -2079,6 +2029,7 @@ def testRequestResourcesIdleTimeout(self): "node_config": {}, "resources": { "CPU": 2, + "GPU": 1, "WORKER": 1 }, "max_workers": 3 @@ -2195,6 +2146,7 @@ def testRequestResourcesRaceConditionsLong(self): "node_config": {}, "resources": { "CPU": 2, + "GPU": 1, "WORKER": 1 }, "max_workers": 3, @@ -2308,6 +2260,7 @@ def testRequestResourcesRaceConditionWithMinWorker(self): "node_config": {}, "resources": { "CPU": 2, + "GPU": 1, "WORKER": 1 }, "max_workers": 3, @@ -2429,8 +2382,8 @@ def test_info_string(): "CPU": (530, 544), "GPU": (2, 2), "AcceleratorType:V100": (0, 2), - "memory": (2 * 2**30, 2**33), - "object_store_memory": (3.14 * 2**30, 2**34) + "memory": (0, 1583.19), + "object_store_memory": (0, 471.02) }, resource_demand=[({ "CPU": 1 @@ -2450,8 +2403,7 @@ def test_info_string(): "p3.2xlarge": 2, "m4.4xlarge": 20 }, - pending_nodes=[("1.2.3.4", "m4.4xlarge", STATUS_WAITING_FOR_SSH), - ("1.2.3.5", "m4.4xlarge", STATUS_WAITING_FOR_SSH)], + pending_nodes=[("1.2.3.4", "m4.4xlarge"), ("1.2.3.5", "m4.4xlarge")], pending_launches={"m4.4xlarge": 2}, failed_nodes=[("1.2.3.6", "p3.2xlarge")]) @@ -2464,8 +2416,8 @@ def 
test_info_string(): 20 m4.4xlarge Pending: m4.4xlarge, 2 launching - 1.2.3.4: m4.4xlarge, waiting-for-ssh - 1.2.3.5: m4.4xlarge, waiting-for-ssh + 1.2.3.4: m4.4xlarge, setting up + 1.2.3.5: m4.4xlarge, setting up Recent failures: (no failures) @@ -2473,11 +2425,11 @@ def test_info_string(): -------------------------------------------------------- Usage: - 0/2 AcceleratorType:V100 530/544 CPU 2/2 GPU - 2.00/8.000 GiB memory - 3.14/16.000 GiB object_store_memory + 0/2 AcceleratorType:V100 + 0.00/77.304 GiB memory + 0.00/22.999 GiB object_store_memory Demands: {'CPU': 1}: 150+ pending tasks/actors @@ -2500,8 +2452,8 @@ def test_info_string_no_node_type(): "CPU": (530, 544), "GPU": (2, 2), "AcceleratorType:V100": (0, 2), - "memory": (2 * 2**30, 2**33), - "object_store_memory": (3.14 * 2**30, 2**34) + "memory": (0, 1583.19), + "object_store_memory": (0, 471.02) }, resource_demand=[({ "CPU": 1 @@ -2528,11 +2480,11 @@ def test_info_string_no_node_type(): Resources ----------------------------------------------------- Usage: - 0/2 AcceleratorType:V100 530/544 CPU 2/2 GPU - 2.00/8.000 GiB memory - 3.14/16.000 GiB object_store_memory + 0/2 AcceleratorType:V100 + 0.00/77.304 GiB memory + 0.00/22.999 GiB object_store_memory Demands: {'CPU': 1}: 150+ pending tasks/actors diff --git a/python/ray/tests/test_serialization.py b/python/ray/tests/test_serialization.py index 7b5f32f96a70..8c72ba209420 100644 --- a/python/ray/tests/test_serialization.py +++ b/python/ray/tests/test_serialization.py @@ -616,13 +616,6 @@ def custom_deserializer(x): A, serializer=custom_serializer, deserializer=custom_deserializer) ray.get(ray.put(A(1))) - ray.util.deregister_serializer(A) - with pytest.raises(Exception): - ray.get(ray.put(A(1))) - - # deregister again takes no effects - ray.util.deregister_serializer(A) - if __name__ == "__main__": import pytest diff --git a/python/ray/tests/test_shuffle.py b/python/ray/tests/test_shuffle.py deleted file mode 100644 index 31a62f691c9b..000000000000 --- 
a/python/ray/tests/test_shuffle.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest -import sys - -from ray.experimental import shuffle - - -def test_shuffle(): - shuffle.main() - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_stress.py b/python/ray/tests/test_stress.py index 99ed186716e2..2007887367ef 100644 --- a/python/ray/tests/test_stress.py +++ b/python/ray/tests/test_stress.py @@ -15,7 +15,7 @@ def ray_start_combination(request): initialize_head=True, head_node_args={ "num_cpus": 10, - "redis_max_memory": 10**8 + "redis_max_memory": 10**7 }) for i in range(num_nodes - 1): cluster.add_node(num_cpus=10) diff --git a/python/ray/tests/test_stress_failure.py b/python/ray/tests/test_stress_failure.py index 83d9f40f24ed..01d39afa8065 100644 --- a/python/ray/tests/test_stress_failure.py +++ b/python/ray/tests/test_stress_failure.py @@ -20,7 +20,7 @@ def ray_start_reconstruction(request): head_node_args={ "num_cpus": 1, "object_store_memory": plasma_store_memory // num_nodes, - "redis_max_memory": 10**8, + "redis_max_memory": 10**7, "_system_config": { "object_timeout_milliseconds": 200 } diff --git a/python/ray/tests/test_stress_sharded.py b/python/ray/tests/test_stress_sharded.py index c6e5cd484bb2..7f05f27acb37 100644 --- a/python/ray/tests/test_stress_sharded.py +++ b/python/ray/tests/test_stress_sharded.py @@ -14,7 +14,7 @@ def ray_start_sharded(request): object_store_memory=int(0.5 * 10**9), num_cpus=10, # _num_redis_shards=num_redis_shards, - _redis_max_memory=10**8) + _redis_max_memory=10**7) yield None diff --git a/python/ray/tests/test_unreconstructable_errors.py b/python/ray/tests/test_unreconstructable_errors.py index 24be89b94297..501dce905530 100644 --- a/python/ray/tests/test_unreconstructable_errors.py +++ b/python/ray/tests/test_unreconstructable_errors.py @@ -10,7 +10,7 @@ def setUp(self): ray.init( num_cpus=1, object_store_memory=150 * 1024 * 1024, - _redis_max_memory=10**8) + 
_redis_max_memory=10000000) def tearDown(self): ray.shutdown() diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index 52e6d0ed116b..007055364a78 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -87,7 +87,7 @@ py_test( py_test( name = "test_function_api", - size = "medium", + size = "small", srcs = ["tests/test_function_api.py"], deps = [":tune_lib"], tags = ["exclusive"], @@ -163,14 +163,6 @@ py_test( tags = ["exclusive"], ) -py_test( - name = "test_remote", - size = "medium", - srcs = ["tests/test_remote.py"], - deps = [":tune_lib"], - tags = ["exclusive"], -) - py_test( name = "test_sample", size = "medium", diff --git a/python/ray/tune/examples/mnist_ptl_mini.py b/python/ray/tune/examples/mnist_ptl_mini.py index e3b226d44566..b1c2e2aa9a09 100644 --- a/python/ray/tune/examples/mnist_ptl_mini.py +++ b/python/ray/tune/examples/mnist_ptl_mini.py @@ -1,7 +1,7 @@ import torch from torch.nn import functional as F import pytorch_lightning as pl -from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule +from pl_bolts.datamodules import MNISTDataModule import os from ray.tune.integration.pytorch_lightning import TuneReportCallback @@ -16,7 +16,6 @@ def __init__(self, config, data_dir=None): self.data_dir = data_dir or os.getcwd() self.lr = config["lr"] layer_1, layer_2 = config["layer_1"], config["layer_2"] - self.batch_size = config["batch_size"] # mnist images are (1, 28, 28) (channels, width, height) self.layer_1 = torch.nn.Linear(28 * 28, layer_1) diff --git a/python/ray/tune/function_runner.py b/python/ray/tune/function_runner.py index c7c088293757..9da6b260130a 100644 --- a/python/ray/tune/function_runner.py +++ b/python/ray/tune/function_runner.py @@ -644,22 +644,16 @@ def inner(config, checkpoint_dir=None): fn_kwargs[k] = parameter_registry.get(prefix + k) fn(config, **fn_kwargs) - fn_name = getattr(fn, "__name__", "tune_with_parameters") - inner.__name__ = fn_name - # Use correct function signature if no `checkpoint_dir` 
parameter is set if not use_checkpoint: def _inner(config): inner(config, checkpoint_dir=None) - _inner.__name__ = fn_name - if hasattr(fn, "__mixins__"): _inner.__mixins__ = fn.__mixins__ return _inner if hasattr(fn, "__mixins__"): inner.__mixins__ = fn.__mixins__ - return inner diff --git a/python/ray/tune/integration/mlflow.py b/python/ray/tune/integration/mlflow.py index 6e038b810f78..cbd3811d4e30 100644 --- a/python/ray/tune/integration/mlflow.py +++ b/python/ray/tune/integration/mlflow.py @@ -274,8 +274,8 @@ def train_fn(config): @mlflow_mixin def train_fn(config): for i in range(10): - loss = config["a"] + config["b"] - mlflow.log_metric(key="loss", value=loss) + loss = self.config["a"] + self.config["b"] + mlflow.log_metric(key="loss", value=loss}) tune.report(loss=loss, done=True) tune.run( diff --git a/python/ray/tune/progress_reporter.py b/python/ray/tune/progress_reporter.py index a462f8e51ef3..a71a2da546a8 100644 --- a/python/ray/tune/progress_reporter.py +++ b/python/ray/tune/progress_reporter.py @@ -57,13 +57,6 @@ def report(self, trials: List[Trial], done: bool, *sys_info: Dict): """ raise NotImplementedError - def set_search_properties(self, metric: Optional[str], - mode: Optional[str]): - return True - - def set_total_samples(self, total_samples: int): - pass - class TuneReporterBase(ProgressReporter): """Abstract base class for the default Tune reporters. 
diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py index c5aaeee79a8e..a1fd4a8f3d06 100644 --- a/python/ray/tune/ray_trial_executor.py +++ b/python/ray/tune/ray_trial_executor.py @@ -154,7 +154,15 @@ class RayTrialExecutor(TrialExecutor): def __init__(self, queue_trials: bool = False, reuse_actors: bool = False, + ray_auto_init: Optional[bool] = None, refresh_period: Optional[float] = None): + if ray_auto_init is None: + if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1": + logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.") + ray_auto_init = False + else: + ray_auto_init = True + super(RayTrialExecutor, self).__init__(queue_trials) # Check for if we are launching a trial without resources in kick off # autoscaler. @@ -185,6 +193,11 @@ def __init__(self, self._last_ip_refresh = float("-inf") self._last_ip_addresses = set() self._last_nontrivial_wait = time.time() + if not ray.is_initialized() and ray_auto_init: + logger.info("Initializing Ray automatically." + "For cluster usage or custom Ray initialization, " + "call `ray.init(...)` before `tune.run`.") + ray.init() if ray.is_initialized(): self._update_avail_resources() @@ -560,7 +573,6 @@ def get_next_available_trial(self, timeout: Optional[float] = None): return None shuffled_results = list(self._running.keys()) random.shuffle(shuffled_results) - # Note: We shuffle the results because `ray.wait` by default returns # the first available result, and we want to guarantee that slower # trials (i.e. trials that run remotely) also get fairly reported. 
diff --git a/python/ray/tune/sample.py b/python/ray/tune/sample.py index 3be1b61e0c68..e4d349ee9db1 100644 --- a/python/ray/tune/sample.py +++ b/python/ray/tune/sample.py @@ -1,4 +1,5 @@ import logging +import random from copy import copy from inspect import signature from math import isclose @@ -294,7 +295,7 @@ def sample(self, spec: Optional[Union[List[Dict], Dict]] = None, size: int = 1): - items = np.random.choice(domain.categories, size=size).tolist() + items = random.choices(domain.categories, k=size) return items if len(items) > 1 else domain.cast(items[0]) default_sampler_cls = _Uniform @@ -470,7 +471,7 @@ def choice(categories: List): """Sample a categorical value. Sampling from ``tune.choice([1, 2])`` is equivalent to sampling from - ``np.random.choice([1, 2])`` + ``random.choice([1, 2])`` """ return Categorical(categories).uniform() diff --git a/python/ray/tune/schedulers/pb2_utils.py b/python/ray/tune/schedulers/pb2_utils.py index 37dc422e0337..881d5345f04d 100644 --- a/python/ray/tune/schedulers/pb2_utils.py +++ b/python/ray/tune/schedulers/pb2_utils.py @@ -75,7 +75,7 @@ def normalize(data, wrt): which can be specified. 
""" return (data - np.min(wrt, axis=0)) / ( - np.max(wrt, axis=0) - np.min(wrt, axis=0) + 1e-8) + np.max(wrt, axis=0) - np.min(wrt, axis=0)) def standardize(data): diff --git a/python/ray/tune/suggest/ax.py b/python/ray/tune/suggest/ax.py index 85aa79f30284..7cccf74a79d6 100644 --- a/python/ray/tune/suggest/ax.py +++ b/python/ray/tune/suggest/ax.py @@ -1,6 +1,7 @@ import copy from typing import Dict, List, Optional, Union +from ax.service.ax_client import AxClient from ray.tune.result import DEFAULT_METRIC from ray.tune.sample import Categorical, Float, Integer, LogUniform, \ Quantized, Uniform @@ -11,17 +12,8 @@ try: import ax - from ax.service.ax_client import AxClient except ImportError: - ax = AxClient = None - -# This exception only exists in newer Ax releases for python 3.7 -try: - from ax.exceptions.generation_strategy import \ - MaxParallelismReachedException -except ImportError: - MaxParallelismReachedException = Exception - + ax = None import logging from ray.tune.suggest import Searcher @@ -132,7 +124,6 @@ def __init__(self, assert ax is not None, """Ax must be installed! You can install AxSearch with the command: `pip install ax-platform sqlalchemy`.""" - if mode: assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." 
@@ -160,6 +151,7 @@ def __init__(self, self.max_concurrent = max_concurrent + self._objective_name = metric self._parameters = [] self._live_trial_mapping = {} @@ -187,10 +179,6 @@ def _setup_experiment(self): "`AxClient.create_experiment()`, or you should pass an " "Ax search space as the `space` parameter to `AxSearch`, " "or pass a `config` dict to `tune.run()`.") - if self._mode not in ["min", "max"]: - raise ValueError( - "Please specify the `mode` argument when initializing " - "the `AxSearch` object or pass it to `tune.run()`.") self._ax.create_experiment( parameters=self._space, objective_name=self._metric, @@ -200,25 +188,16 @@ def _setup_experiment(self): else: if any([ self._space, self._parameter_constraints, - self._outcome_constraints, self._mode, self._metric + self._outcome_constraints ]): raise ValueError( "If you create the Ax experiment yourself, do not pass " "values for these parameters to `AxSearch`: {}.".format([ - "space", - "parameter_constraints", - "outcome_constraints", - "mode", - "metric", + "space", "parameter_constraints", "outcome_constraints" ])) exp = self._ax.experiment - - # Update mode and metric from experiment if it has been passed - self._mode = "min" \ - if exp.optimization_config.objective.minimize else "max" - self._metric = exp.optimization_config.objective.metric.name - + self._objective_name = exp.optimization_config.objective.metric.name self._parameters = list(exp.parameters) if self._ax._enforce_sequential_optimization: @@ -260,10 +239,7 @@ def suggest(self, trial_id: str) -> Optional[Dict]: config = self._points_to_evaluate.pop(0) parameters, trial_index = self._ax.attach_trial(config) else: - try: - parameters, trial_index = self._ax.get_next_trial() - except MaxParallelismReachedException: - return None + parameters, trial_index = self._ax.get_next_trial() self._live_trial_mapping[trial_id] = trial_index return unflatten_dict(parameters) @@ -279,12 +255,14 @@ def on_trial_complete(self, trial_id, result=None, 
error=False): def _process_result(self, trial_id, result): ax_trial_index = self._live_trial_mapping[trial_id] - metric_dict = {self._metric: (result[self._metric], None)} + metric_dict = { + self._objective_name: (result[self._objective_name], 0.0) + } outcome_names = [ oc.metric.name for oc in self._ax.experiment.optimization_config.outcome_constraints ] - metric_dict.update({on: (result[on], None) for on in outcome_names}) + metric_dict.update({on: (result[on], 0.0) for on in outcome_names}) self._ax.complete_trial( trial_index=ax_trial_index, raw_data=metric_dict) diff --git a/python/ray/tune/suggest/optuna.py b/python/ray/tune/suggest/optuna.py index a966892d0ef5..a6468b8617dd 100644 --- a/python/ray/tune/suggest/optuna.py +++ b/python/ray/tune/suggest/optuna.py @@ -98,7 +98,7 @@ class OptunaSearch(Searcher): param.suggest_uniform("b", 10, 20) ] - optuna_search = OptunaSearch( + algo = OptunaSearch( space, metric="loss", mode="min") @@ -218,14 +218,8 @@ def on_trial_complete(self, error: bool = False): ot_trial = self._ot_trials[trial_id] ot_trial_id = ot_trial._trial_id - - val = result.get(self.metric, None) - if hasattr(self._storage, "set_trial_value"): - # Backwards compatibility with optuna < 2.4.0 - self._storage.set_trial_value(ot_trial_id, val) - else: - self._storage.set_trial_values(ot_trial_id, [val]) - + self._storage.set_trial_value(ot_trial_id, result.get( + self.metric, None)) self._storage.set_trial_state(ot_trial_id, ot.trial.TrialState.COMPLETE) diff --git a/python/ray/tune/suggest/zoopt.py b/python/ray/tune/suggest/zoopt.py index 71cedffd5500..c0c0ddb18562 100644 --- a/python/ray/tune/suggest/zoopt.py +++ b/python/ray/tune/suggest/zoopt.py @@ -198,8 +198,8 @@ def _setup_zoopt(self): init_samples = None if self._points_to_evaluate: - logger.warning("`points_to_evaluate` is ignored by ZOOpt in " - "versions <= 0.4.1.") + logger.warning( + "`points_to_evaluate` seems to be ignored by ZOOpt.") init_samples = [ Solution(x=tuple(point[dim] for 
dim in self._dim_keys)) for point in self._points_to_evaluate @@ -213,6 +213,8 @@ def _setup_zoopt(self): parameter=par, parallel_num=self.parallel_num, **self.kwargs) + if init_samples: + self.optimizer.init_attribute() def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool: diff --git a/python/ray/tune/tests/test_convergence_gaussian_process.py b/python/ray/tune/tests/test_convergence_gaussian_process.py index c0abecdd3aef..c81eff8ef6e7 100644 --- a/python/ray/tune/tests/test_convergence_gaussian_process.py +++ b/python/ray/tune/tests/test_convergence_gaussian_process.py @@ -1,4 +1,3 @@ -import math import numpy as np import ray @@ -16,41 +15,33 @@ def loss(config, reporter): class ConvergenceTest(unittest.TestCase): """Test convergence in gaussian process.""" - def shutDown(self): - ray.shutdown() - def test_convergence_gaussian_process(self): np.random.seed(0) ray.init(local_mode=True, num_cpus=1, num_gpus=1) - # This is the space of parameters to explore - space = {"x": tune.uniform(0, 20)} + space = { + "x": (0, 20) # This is the space of parameters to explore + } resources_per_trial = {"cpu": 1, "gpu": 0} # Following bayesian optimization - gp = BayesOptSearch(random_search_steps=10) + gp = BayesOptSearch( + space, metric="loss", mode="min", random_search_steps=10) gp.repeat_float_precision = 5 gp = ConcurrencyLimiter(gp, 1) # Execution of the BO. analysis = tune.run( loss, - metric="loss", - mode="min", # stop=EarlyStopping("loss", mode="min", patience=5), search_alg=gp, - config=space, + config={}, num_samples=100, # Number of iterations resources_per_trial=resources_per_trial, raise_on_failed_trial=False, fail_fast=True, verbose=1) - assert len(analysis.trials) in {13, 43} # it is 43 on the cluster? 
- assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-8) + assert len(analysis.trials) == 41 - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) + ray.shutdown() diff --git a/python/ray/tune/tests/test_function_api.py b/python/ray/tune/tests/test_function_api.py index e18ee35e07cc..9ee2cdc64777 100644 --- a/python/ray/tune/tests/test_function_api.py +++ b/python/ray/tune/tests/test_function_api.py @@ -6,6 +6,7 @@ import unittest import ray +import ray.cloudpickle as cloudpickle from ray.rllib import _register_all from ray import tune @@ -229,7 +230,7 @@ def train(config, checkpoint_dir=None): new_trainable2 = wrapped(logger_creator=self.logger_creator) new_trainable2.restore(checkpoint) result = new_trainable2.train() - self.assertEqual(result[TRAINING_ITERATION], 1) + self.assertEquals(result[TRAINING_ITERATION], 1) checkpoint = new_trainable2.save() new_trainable2.stop() @@ -404,15 +405,14 @@ def train(config, checkpoint_dir=None): def testEnabled(self): def train(config, checkpoint_dir=None): is_active = tune.is_session_enabled() - result = {"active": is_active} if is_active: - tune.report(**result) - return result + tune.report(active=is_active) + return is_active - assert train({})["active"] is False + assert train({}) is False analysis = tune.run(train) t = analysis.trials[0] - assert t.last_result["active"], t.last_result + assert t.last_result["active"] def testBlankCheckpoint(self): def train(config, checkpoint_dir=None): @@ -450,12 +450,11 @@ def train(config, data=None): trial_1, trial_2 = tune.run( with_parameters(train, data=data), num_samples=2).trials - self.assertEqual(data.data[101], 0) - self.assertEqual(trial_1.last_result["metric"], 500_000) - self.assertEqual(trial_1.last_result["hundred"], 1) - self.assertEqual(trial_2.last_result["metric"], 500_000) - self.assertEqual(trial_2.last_result["hundred"], 1) - self.assertTrue(str(trial_1).startswith("train_")) + 
self.assertEquals(data.data[101], 0) + self.assertEquals(trial_1.last_result["metric"], 500_000) + self.assertEquals(trial_1.last_result["hundred"], 1) + self.assertEquals(trial_2.last_result["metric"], 500_000) + self.assertEquals(trial_2.last_result["hundred"], 1) # With checkpoint dir parameter def train(config, checkpoint_dir="DIR", data=None): @@ -465,12 +464,11 @@ def train(config, checkpoint_dir="DIR", data=None): trial_1, trial_2 = tune.run( with_parameters(train, data=data), num_samples=2).trials - self.assertEqual(data.data[101], 0) - self.assertEqual(trial_1.last_result["metric"], 500_000) - self.assertEqual(trial_1.last_result["cp"], "DIR") - self.assertEqual(trial_2.last_result["metric"], 500_000) - self.assertEqual(trial_2.last_result["cp"], "DIR") - self.assertTrue(str(trial_1).startswith("train_")) + self.assertEquals(data.data[101], 0) + self.assertEquals(trial_1.last_result["metric"], 500_000) + self.assertEquals(trial_1.last_result["cp"], "DIR") + self.assertEquals(trial_2.last_result["metric"], 500_000) + self.assertEquals(trial_2.last_result["cp"], "DIR") def testWithParameters2(self): class Data: @@ -482,9 +480,7 @@ def train(config, data=None): tune.report(metric=len(data.data)) trainable = tune.with_parameters(train, data=Data()) - # ray.cloudpickle will crash for some reason - import cloudpickle as cp - dumped = cp.dumps(trainable) + dumped = cloudpickle.dumps(trainable) assert sys.getsizeof(dumped) < 100 * 1024 def testReturnAnonymous(self): @@ -496,8 +492,8 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result[DEFAULT_METRIC], 4) - self.assertEqual(trial_2.last_result[DEFAULT_METRIC], 8) + self.assertEquals(trial_1.last_result[DEFAULT_METRIC], 4) + self.assertEquals(trial_2.last_result[DEFAULT_METRIC], 8) def testReturnSpecific(self): def train(config): @@ -508,8 +504,8 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result["m"], 4) - 
self.assertEqual(trial_2.last_result["m"], 8) + self.assertEquals(trial_1.last_result["m"], 4) + self.assertEquals(trial_2.last_result["m"], 8) def testYieldAnonymous(self): def train(config): @@ -521,8 +517,8 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result[DEFAULT_METRIC], 4 + 9) - self.assertEqual(trial_2.last_result[DEFAULT_METRIC], 8 + 9) + self.assertEquals(trial_1.last_result[DEFAULT_METRIC], 4 + 9) + self.assertEquals(trial_2.last_result[DEFAULT_METRIC], 8 + 9) def testYieldSpecific(self): def train(config): @@ -534,10 +530,5 @@ def train(config): "a": tune.grid_search([4, 8]) }).trials - self.assertEqual(trial_1.last_result["m"], 4 + 9) - self.assertEqual(trial_2.last_result["m"], 8 + 9) - - -if __name__ == "__main__": - import pytest - sys.exit(pytest.main(["-v", __file__])) + self.assertEquals(trial_1.last_result["m"], 4 + 9) + self.assertEquals(trial_2.last_result["m"], 8 + 9) diff --git a/python/ray/tune/tests/test_remote.py b/python/ray/tune/tests/test_remote.py deleted file mode 100644 index 1e521c54b7a6..000000000000 --- a/python/ray/tune/tests/test_remote.py +++ /dev/null @@ -1,77 +0,0 @@ -import unittest - -import ray -from ray.tune import register_trainable, run_experiments, run -from ray.tune.result import TIMESTEPS_TOTAL -from ray.tune.experiment import Experiment -from ray.tune.trial import Trial -from ray.util.client.ray_client_helpers import ray_start_client_server - - -class RemoteTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testRemoteRunExperiments(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - exp1 = Experiment(**{ - "name": "foo", - "run": "f1", - }) - [trial] = run_experiments(exp1, _remote=True) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testRemoteRun(self): - def train(config, reporter): - for i in 
range(100): - reporter(timesteps_total=i) - - analysis = run(train, _remote=True) - [trial] = analysis.trials - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testRemoteRunExperimentsInClient(self): - ray.init() - assert not ray.util.client.ray.is_connected() - with ray_start_client_server(): - assert ray.util.client.ray.is_connected() - - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - exp1 = Experiment(**{ - "name": "foo", - "run": "f1", - }) - [trial] = run_experiments(exp1) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testRemoteRunInClient(self): - ray.init() - assert not ray.util.client.ray.is_connected() - with ray_start_client_server(): - assert ray.util.client.ray.is_connected() - - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - analysis = run(train) - [trial] = analysis.trials - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py index b631dc2b15b5..378a2c1ef565 100644 --- a/python/ray/tune/tests/test_sample.py +++ b/python/ray/tune/tests/test_sample.py @@ -193,32 +193,6 @@ def testQuantized(self): samples = tune.sample.Float(0, 33).quantized(3).sample(size=1000) self.assertTrue(all(0 <= s <= 33 for s in samples)) - def testCategoricalSeedInTrainingLoop(self): - def train(config): - return 0 - - config = { - "integer": tune.randint(0, 100_000), - "choice": tune.choice(list(range(100_000))) - } - - np.random.seed(1000) - - out_1 = tune.run(train, config=config, num_samples=8, verbose=0) - - integers_1 = [t.config["integer"] for t in out_1.trials] - choices_1 = 
[t.config["choice"] for t in out_1.trials] - - np.random.seed(1000) - - out_2 = tune.run(train, config=config, num_samples=8, verbose=0) - - integers_2 = [t.config["integer"] for t in out_2.trials] - choices_2 = [t.config["choice"] for t in out_2.trials] - - self.assertSequenceEqual(integers_1, integers_2) - self.assertSequenceEqual(choices_1, choices_2) - def testConvertAx(self): from ray.tune.suggest.ax import AxSearch from ax.service.ax_client import AxClient @@ -263,14 +237,12 @@ def testConvertAx(self): ] client1 = AxClient(random_seed=1234) - client1.create_experiment( - parameters=converted_config, objective_name="a", minimize=False) - searcher1 = AxSearch(ax_client=client1) + client1.create_experiment(parameters=converted_config) + searcher1 = AxSearch(ax_client=client1, metric="a", mode="max") client2 = AxClient(random_seed=1234) - client2.create_experiment( - parameters=ax_config, objective_name="a", minimize=False) - searcher2 = AxSearch(ax_client=client2) + client2.create_experiment(parameters=ax_config) + searcher2 = AxSearch(ax_client=client2, metric="a", mode="max") config1 = searcher1.suggest("0") config2 = searcher2.suggest("0") @@ -980,11 +952,9 @@ def testPointsToEvaluateSkOpt(self): return self._testPointsToEvaluate(SkOptSearch, config) def testPointsToEvaluateZoOpt(self): - self.skipTest( - "ZOOpt's latest release (0.4.1) does not support sampling " - "initial points. Please re-enable this test after the next " - "release.") - + # https://github.com/polixir/ZOOpt/issues/5 + self.skipTest("ZoOpt currently ignores initial points. 
This test " + "will be enabled after this has been fixed.") config = { "metric": tune.sample.Categorical([1, 2, 3, 4]).uniform(), "a": tune.sample.Categorical(["t1", "t2", "t3", "t4"]).uniform(), diff --git a/python/ray/tune/tests/test_searchers.py b/python/ray/tune/tests/test_searchers.py index 403b11276dcc..0b50be49db90 100644 --- a/python/ray/tune/tests/test_searchers.py +++ b/python/ray/tune/tests/test_searchers.py @@ -49,10 +49,8 @@ def testAx(self): # At least one nan, inf, -inf and float client = AxClient(random_seed=4321) client.create_experiment( - parameters=converted_config, - objective_name="_metric", - minimize=False) - searcher = AxSearch(ax_client=client) + parameters=converted_config, objective_name="_metric") + searcher = AxSearch(ax_client=client, metric="_metric", mode="max") out = tune.run( _invalid_objective, diff --git a/python/ray/tune/tests/test_trainable_util.py b/python/ray/tune/tests/test_trainable_util.py index 23dfb35733e7..25860eb1c569 100644 --- a/python/ray/tune/tests/test_trainable_util.py +++ b/python/ray/tune/tests/test_trainable_util.py @@ -1,14 +1,10 @@ -from collections import OrderedDict import os -import sys +import pickle import shutil import unittest -from unittest.mock import patch import ray.utils -import ray.cloudpickle as cloudpickle -from ray.tune.utils.util import wait_for_gpu -from ray.tune.utils.util import unflatten_dict + from ray.tune.utils.trainable import TrainableUtil @@ -16,15 +12,13 @@ class TrainableUtilTest(unittest.TestCase): def setUp(self): self.checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(), "tune", "MyTrainable123") - self.checkpoint_dir = TrainableUtil.make_checkpoint_dir( - self.checkpoint_dir, "0") + TrainableUtil.make_checkpoint_dir(self.checkpoint_dir) def tearDown(self): self.addCleanup(shutil.rmtree, self.checkpoint_dir) def testFindCheckpointDir(self): - checkpoint_path = os.path.join(self.checkpoint_dir, - "0/my/nested/chkpt") + checkpoint_path = os.path.join(self.checkpoint_dir, 
"my/nested/chkpt") os.makedirs(checkpoint_path) found_dir = TrainableUtil.find_checkpoint_dir(checkpoint_path) self.assertEquals(self.checkpoint_dir, found_dir) @@ -42,7 +36,7 @@ def testPickleCheckpoint(self): checkpoint_path = os.path.join(self.checkpoint_dir, "0") data_dict = TrainableUtil.pickle_checkpoint(checkpoint_path) - loaded = cloudpickle.loads(data_dict) + loaded = pickle.loads(data_dict) checkpoint_name = os.path.basename(checkpoint_path) self.assertEqual(loaded["checkpoint_name"], checkpoint_name) @@ -50,94 +44,3 @@ def testPickleCheckpoint(self): for i in range(5): path = os.path.join(self.checkpoint_dir, str(i)) self.assertEquals(loaded["data"][str(i)], open(path, "rb").read()) - - -class UnflattenDictTest(unittest.TestCase): - def test_output_type(self): - in_ = OrderedDict({"a/b": 1, "c/d": 2, "e": 3}) - out = unflatten_dict(in_) - assert type(in_) is type(out) - - def test_one_level_nested(self): - result = unflatten_dict({"a/b": 1, "c/d": 2, "e": 3}) - assert result == {"a": {"b": 1}, "c": {"d": 2}, "e": 3} - - def test_multi_level_nested(self): - result = unflatten_dict({"a/b/c/d": 1, "b/c/d": 2, "c/d": 3, "e": 4}) - assert result == { - "a": { - "b": { - "c": { - "d": 1, - }, - }, - }, - "b": { - "c": { - "d": 2, - }, - }, - "c": { - "d": 3, - }, - "e": 4, - } - - -class GPUUtilMock: - class GPU: - def __init__(self, id, uuid, util=None): - self.id = id - self.uuid = uuid - self.util = [0.5, 0.0] - - @property - def memoryUtil(self): - if self.util: - return self.util.pop(0) - return 0 - - def __init__(self, gpus, gpu_uuids): - self.gpus = gpus - self.uuids = gpu_uuids - self.gpu_list = [ - self.GPU(gpu, uuid) for gpu, uuid in zip(self.gpus, self.uuids) - ] - - def getGPUs(self): - return self.gpu_list - - -class GPUTest(unittest.TestCase): - def setUp(self): - sys.modules["GPUtil"] = GPUUtilMock([0, 1], ["GPU-aaa", "GPU-bbb"]) - - def testGPUWait1(self): - wait_for_gpu(0, delay_s=0) - - def testGPUWait2(self): - wait_for_gpu("1", delay_s=0) - 
- def testGPUWait3(self): - wait_for_gpu("GPU-aaa", delay_s=0) - - def testGPUWaitFail(self): - with self.assertRaises(ValueError): - wait_for_gpu(2, delay_s=0) - - with self.assertRaises(ValueError): - wait_for_gpu("4", delay_s=0) - - with self.assertRaises(ValueError): - wait_for_gpu(1.23, delay_s=0) - - @patch("ray.get_gpu_ids", lambda: ["0"]) - def testDefaultGPU(self): - import sys - sys.modules["GPUtil"] = GPUUtilMock([0], ["GPU-aaa"]) - wait_for_gpu(delay_s=0) - - -if __name__ == "__main__": - import pytest - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_trial_runner_3.py b/python/ray/tune/tests/test_trial_runner_3.py index 3c2d05981677..ab10112d47d4 100644 --- a/python/ray/tune/tests/test_trial_runner_3.py +++ b/python/ray/tune/tests/test_trial_runner_3.py @@ -695,27 +695,6 @@ def num_checkpoints(trial): self.assertTrue(trials[0].has_checkpoint()) self.assertEqual(num_checkpoints(trials[0]), 2) - @patch("ray.tune.syncer.CLOUD_SYNC_PERIOD", 0) - def testCheckpointAutoPeriod(self): - ray.init(num_cpus=3) - - # This makes checkpointing take 2 seconds. - def sync_up(source, target): - time.sleep(2) - return True - - runner = TrialRunner( - local_checkpoint_dir=self.tmpdir, - checkpoint_period="auto", - sync_to_cloud=sync_up, - remote_checkpoint_dir="fake") - runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 1})) - - runner.step() # Run one step, this will trigger checkpointing - - self.assertGreaterEqual(runner._checkpoint_manager._checkpoint_period, - 38.) 
- class SearchAlgorithmTest(unittest.TestCase): @classmethod diff --git a/python/ray/tune/tests/test_trial_runner_callbacks.py b/python/ray/tune/tests/test_trial_runner_callbacks.py index 6211220c2458..75b06d0e34c8 100644 --- a/python/ray/tune/tests/test_trial_runner_callbacks.py +++ b/python/ray/tune/tests/test_trial_runner_callbacks.py @@ -73,7 +73,6 @@ def get_next_failed_trial(self): class TrialRunnerCallbacks(unittest.TestCase): def setUp(self): - ray.init() self.tmpdir = tempfile.mkdtemp() self.callback = TestCallback() self.executor = _MockTrialExecutor() diff --git a/python/ray/tune/tests/test_trial_scheduler_pbt.py b/python/ray/tune/tests/test_trial_scheduler_pbt.py index 48ba7322958b..300ea0bfbc25 100644 --- a/python/ray/tune/tests/test_trial_scheduler_pbt.py +++ b/python/ray/tune/tests/test_trial_scheduler_pbt.py @@ -29,14 +29,7 @@ def __call__(self, *args, **kwargs): class PopulationBasedTrainingMemoryTest(unittest.TestCase): def setUp(self): - ray.init( - num_cpus=1, - object_store_memory=100 * MB, - _system_config={ - # This test uses ray.objects(), which only works with the - # GCS-based object directory - "ownership_based_object_directory_enabled": False, - }) + ray.init(num_cpus=1, object_store_memory=100 * MB) def tearDown(self): ray.shutdown() @@ -97,13 +90,7 @@ def save(self, *args, **kwargs): class PopulationBasedTrainingFileDescriptorTest(unittest.TestCase): def setUp(self): - ray.init( - num_cpus=2, - _system_config={ - # This test uses ray.objects(), which only works with the - # GCS-based object directory - "ownership_based_object_directory_enabled": False, - }) + ray.init(num_cpus=2) os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0" def tearDown(self): diff --git a/python/ray/tune/tests/test_tune_restore.py b/python/ray/tune/tests/test_tune_restore.py index 5f9e5a41fbed..baabd2b03939 100644 --- a/python/ray/tune/tests/test_tune_restore.py +++ b/python/ray/tune/tests/test_tune_restore.py @@ -1,10 +1,8 @@ # coding: utf-8 -import signal from 
collections import Counter import os import shutil import tempfile -import time import unittest import skopt import numpy as np @@ -89,66 +87,6 @@ def testPostRestoreCheckpointExistence(self): self.assertTrue(os.path.isfile(self.checkpoint_path)) -class TuneInterruptionTest(unittest.TestCase): - def testExperimentInterrupted(self): - import multiprocessing - - trainer_semaphore = multiprocessing.Semaphore() - driver_semaphore = multiprocessing.Semaphore() - - class SteppingCallback(Callback): - def on_step_end(self, iteration, trials, **info): - driver_semaphore.release() # Driver should continue - trainer_semaphore.acquire() # Wait until released - - def _run(local_dir): - def _train(config): - for i in range(7): - tune.report(val=i) - - tune.run( - _train, - local_dir=local_dir, - name="interrupt", - callbacks=[SteppingCallback()]) - - local_dir = tempfile.mkdtemp() - process = multiprocessing.Process(target=_run, args=(local_dir, )) - process.daemon = False - process.start() - - exp_dir = os.path.join(local_dir, "interrupt") - - # Skip first five steps - for i in range(5): - driver_semaphore.acquire() # Wait for callback - trainer_semaphore.release() # Continue training - - driver_semaphore.acquire() - - experiment_state_file = None - for file in os.listdir(exp_dir): - if file.startswith("experiment_state"): - experiment_state_file = os.path.join(exp_dir, file) - break - - self.assertTrue(experiment_state_file) - last_mtime = os.path.getmtime(experiment_state_file) - - # Now send kill signal - os.kill(process.pid, signal.SIGINT) - # Release trainer. 
It should handle the signal and try to - # checkpoint the experiment - trainer_semaphore.release() - - time.sleep(2) # Wait for checkpoint - new_mtime = os.path.getmtime(experiment_state_file) - - self.assertNotEqual(last_mtime, new_mtime) - - shutil.rmtree(local_dir) - - class TuneFailResumeGridTest(unittest.TestCase): class FailureInjectorCallback(Callback): """Adds random failure injection to the TrialExecutor.""" diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 7507ab50dfb0..fc6152f97a40 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -1,7 +1,6 @@ from typing import Callable, Dict, Sequence, Union import json -import ray import ray.cloudpickle as cloudpickle from collections import deque import copy @@ -167,13 +166,6 @@ class Trial: """ - _nonjson_fields = [ - "results", - "best_result", - "param_config", - "extra_arg", - ] - PENDING = "PENDING" RUNNING = "RUNNING" PAUSED = "PAUSED" @@ -297,6 +289,12 @@ def __init__(self, self.param_config = None self.extra_arg = None + self._nonjson_fields = [ + "results", + "best_result", + "param_config", + "extra_arg", + ] if trial_name_creator: self.custom_trial_name = trial_name_creator(self) @@ -641,9 +639,4 @@ def __setstate__(self, state): self.__dict__.update(state) validate_trainable(self.trainable_name) - - # Avoid creating logdir in client mode for returned trial results, - # since the dir might not be creatable locally. TODO(ekl) thsi is kind - # of a hack. 
- if not ray.util.client.ray.is_connected(): - self.init_logdir() # Create logdir if it does not exist + self.init_logdir() # Create logdir if it does not exist diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index d8b45b19bc7f..c487190f7f66 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional import click from datetime import datetime @@ -16,11 +16,11 @@ from ray.tune.ray_trial_executor import RayTrialExecutor from ray.tune.result import (DEFAULT_METRIC, TIME_THIS_ITER_S, RESULT_DUPLICATE, SHOULD_CHECKPOINT) -from ray.tune.syncer import CloudSyncer, get_cloud_syncer +from ray.tune.syncer import get_cloud_syncer from ray.tune.trial import Checkpoint, Trial from ray.tune.schedulers import FIFOScheduler, TrialScheduler -from ray.tune.suggest import BasicVariantGenerator, SearchAlgorithm -from ray.tune.utils import warn_if_slow, flatten_dict +from ray.tune.suggest import BasicVariantGenerator +from ray.tune.utils import warn_if_slow, flatten_dict, env_integer from ray.tune.utils.log import Verbosity, has_verbosity from ray.tune.utils.placement_groups import TUNE_MAX_PENDING_TRIALS_PG from ray.tune.utils.serialization import TuneFunctionDecoder, \ @@ -42,106 +42,6 @@ def _find_newest_ckpt(ckpt_dir): return max(full_paths) -class _ExperimentCheckpointManager: - """Helper class for managing experiment-level checkpoints. - - This class implements the ``checkpoint()`` method used to checkpoint - experiment state. When called, this will serialize and write to disk - the state of the trial runner, trial executor, and search algorithm, to - a specified checkpoint file. - - The checkpoint period is automatically adjusted to - ``max(10, time_per_checkpoint * 19)``. This means that at most 5% of the - time (1/20) will be used for writing checkpoints, while 95% of the time - (19/20) will be used to handle the rest of the training loop. 
- - """ - - def __init__(self, checkpoint_dir: str, - checkpoint_period: Union[int, float, str], start_time: float, - session_str: str, syncer: CloudSyncer): - self._checkpoint_dir = checkpoint_dir - self._auto_checkpoint_enabled = checkpoint_period == "auto" - if self._auto_checkpoint_enabled: - self._checkpoint_period = 10. # Initial value - else: - self._checkpoint_period = float(checkpoint_period) - - self._start_time = start_time - self._session_str = session_str - - self._syncer = syncer - - self._last_checkpoint_time = 0. - - @property - def auto_checkpoint_enabled(self): - return self._auto_checkpoint_enabled - - def checkpoint(self, - checkpoint_file: str, - trial_runner: "TrialRunner", - trial_executor: RayTrialExecutor, - search_alg: SearchAlgorithm, - force=False): - """Saves execution state to `self._local_checkpoint_dir`. - - Overwrites the current session checkpoint, which starts when self - is instantiated. Throttle depends on self._checkpoint_period. - - Also automatically saves the search algorithm to the local - checkpoint dir. - - Args: - force (bool): Forces a checkpoint despite checkpoint_period. 
- """ - if not self._checkpoint_dir: - return - - now = time.time() - if now - self._last_checkpoint_time < self._checkpoint_period and ( - not force): - return - - def _serialize_and_write(): - runner_state = { - "checkpoints": list(trial_executor.get_checkpoints().values()), - "runner_data": trial_runner.__getstate__(), - "stats": { - "start_time": self._start_time, - "timestamp": self._last_checkpoint_time - } - } - tmp_file_name = os.path.join(self._checkpoint_dir, - ".tmp_checkpoint") - with open(tmp_file_name, "w") as f: - json.dump(runner_state, f, indent=2, cls=TuneFunctionEncoder) - - os.replace(tmp_file_name, checkpoint_file) - search_alg.save_to_dir( - self._checkpoint_dir, session_str=self._session_str) - - checkpoint_time_start = time.monotonic() - _serialize_and_write() - if force: - self._syncer.sync_up() - else: - self._syncer.sync_up_if_needed() - checkpoint_time_taken = time.monotonic() - checkpoint_time_start - - if self._auto_checkpoint_enabled: - # Multiplying this time by 19 means we spend ~5% of the time - # writing global checkpoints and 95% of the time processing trials - self._checkpoint_period = max(10., checkpoint_time_taken * 19) - logger.debug(f"Global experiment checkpointing took " - f"{checkpoint_time_taken:.2f} seconds. " - f"Adjusting checkpoint period to " - f"{self._checkpoint_period:.2f} seconds.") - - self._last_checkpoint_time = time.time() - return self._checkpoint_dir - - class TrialRunner: """A TrialRunner implements the event loop for scheduling trials on Ray. @@ -182,10 +82,8 @@ class TrialRunner: If fail_fast='raise' provided, Tune will automatically raise the exception received by the Trainable. fail_fast='raise' can easily leak resources and should be used with caution. - checkpoint_period (int|str): Trial runner checkpoint periodicity in - seconds. Defaults to ``"auto"``, which adjusts checkpointing - time so that at most 5% of the time is spent on writing - checkpoints. 
+ checkpoint_period (int): Trial runner checkpoint periodicity in + seconds. Defaults to 10. trial_executor (TrialExecutor): Defaults to RayTrialExecutor. callbacks (list): List of callbacks that will be called at different times in the training loop. Must be instances of the @@ -285,7 +183,9 @@ def __init__(self, self._start_time = time.time() self._last_checkpoint_time = -float("inf") - + if checkpoint_period is None: + checkpoint_period = env_integer("TUNE_GLOBAL_CHECKPOINT_S", 10) + self._checkpoint_period = checkpoint_period self._session_str = datetime.fromtimestamp( self._start_time).strftime("%Y-%m-%d_%H-%M-%S") self.checkpoint_file = None @@ -296,20 +196,6 @@ def __init__(self, self._callbacks = CallbackList(callbacks or []) - if checkpoint_period is None: - checkpoint_period = os.getenv("TUNE_GLOBAL_CHECKPOINT_S", "auto") - - self._checkpoint_period = checkpoint_period - self._checkpoint_manager = self._create_checkpoint_manager() - - def _create_checkpoint_manager(self): - return _ExperimentCheckpointManager( - checkpoint_dir=self._local_checkpoint_dir, - checkpoint_period=self._checkpoint_period, - start_time=self._start_time, - session_str=self._session_str, - syncer=self._syncer) - @property def resumed(self): return self._resumed @@ -383,23 +269,36 @@ def checkpoint(self, force=False): Args: force (bool): Forces a checkpoint despite checkpoint_period. """ - with warn_if_slow( - "experiment_checkpoint", - message="Checkpointing the experiment state took " - "{duration:.3f} s, which may be a performance " - "bottleneck. 
Please ensure the " - "`TUNE_GLOBAL_CHECKPOINT_S` environment variable is " - "something significantly higher than this duration " - "to ensure compute time is mostly spent on the main " - "training loop.", - disable=self._checkpoint_manager.auto_checkpoint_enabled): - - self._checkpoint_manager.checkpoint( - checkpoint_file=self.checkpoint_file, - trial_runner=self, - trial_executor=self.trial_executor, - search_alg=self._search_alg, - force=force) + if not self._local_checkpoint_dir: + return + now = time.time() + if now - self._last_checkpoint_time < self._checkpoint_period and ( + not force): + return + self._last_checkpoint_time = now + runner_state = { + "checkpoints": list( + self.trial_executor.get_checkpoints().values()), + "runner_data": self.__getstate__(), + "stats": { + "start_time": self._start_time, + "timestamp": self._last_checkpoint_time + } + } + tmp_file_name = os.path.join(self._local_checkpoint_dir, + ".tmp_checkpoint") + with open(tmp_file_name, "w") as f: + json.dump(runner_state, f, indent=2, cls=TuneFunctionEncoder) + + os.replace(tmp_file_name, self.checkpoint_file) + self._search_alg.save_to_dir( + self._local_checkpoint_dir, session_str=self._session_str) + + if force: + self._syncer.sync_up() + else: + self._syncer.sync_up_if_needed() + return self._local_checkpoint_dir def resume(self, run_errored_only=False): """Resumes all checkpointed trials from previous run. @@ -507,7 +406,16 @@ def _start_trial(trial: Trial) -> bool: self._stop_experiment_if_needed() try: - self.checkpoint() + with warn_if_slow( + "experiment_checkpoint", + message="Checkpointing the experiment state took " + "{duration:.3f} s, which may be a performance " + "bottleneck. 
Please ensure the " + "`TUNE_GLOBAL_CHECKPOINT_S` environment variable is " + "something significantly higher than this duration " + "to ensure compute time is mostly spent on the main " + "training loop."): + self.checkpoint() except Exception as e: logger.warning(f"Trial Runner checkpointing failed: {str(e)}") self._iteration += 1 @@ -1120,8 +1028,7 @@ def __getstate__(self): for k in [ "_trials", "_stop_queue", "_server", "_search_alg", "_scheduler_alg", "_pending_trial_queue_times", - "trial_executor", "_syncer", "_callbacks", - "_checkpoint_manager" + "trial_executor", "_syncer", "_callbacks" ]: del state[k] state["launch_web_server"] = bool(self._server) @@ -1138,7 +1045,5 @@ def __setstate__(self, state): self.__dict__.setdefault("_start_time", start_time) self.__dict__.update(state) - self._checkpoint_manager = self._create_checkpoint_manager() - if launch_web_server: self._server = TuneServer(self, self._server_port) diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index 6ce115126e8a..fab7b79bf5e5 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -1,38 +1,25 @@ -from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Type, \ - Union - -import datetime import logging -import os -import signal import sys import time -import ray -from ray.tune.analysis import ExperimentAnalysis -from ray.tune.callback import Callback from ray.tune.error import TuneError -from ray.tune.experiment import Experiment, convert_to_experiment_list -from ray.tune.logger import Logger -from ray.tune.progress_reporter import CLIReporter, JupyterNotebookReporter, \ - ProgressReporter -from ray.tune.ray_trial_executor import RayTrialExecutor -from ray.tune.registry import get_trainable_cls -from ray.tune.stopper import Stopper -from ray.tune.suggest import BasicVariantGenerator, SearchAlgorithm, \ - SearchGenerator +from ray.tune.experiment import convert_to_experiment_list, Experiment +from ray.tune.analysis import ExperimentAnalysis +from 
ray.tune.suggest import BasicVariantGenerator, SearchGenerator from ray.tune.suggest.suggestion import Searcher from ray.tune.suggest.variant_generator import has_unresolved_values -from ray.tune.syncer import SyncConfig, set_sync_periods, wait_for_sync -from ray.tune.trainable import Trainable from ray.tune.trial import Trial -from ray.tune.trial_runner import TrialRunner +from ray.tune.trainable import Trainable +from ray.tune.ray_trial_executor import RayTrialExecutor from ray.tune.utils.callback import create_default_callbacks +from ray.tune.registry import get_trainable_cls +from ray.tune.syncer import wait_for_sync, set_sync_periods, \ + SyncConfig +from ray.tune.trial_runner import TrialRunner +from ray.tune.progress_reporter import CLIReporter, JupyterNotebookReporter +from ray.tune.schedulers import FIFOScheduler from ray.tune.utils.log import Verbosity, has_verbosity, set_verbosity -# Must come last to avoid circular imports -from ray.tune.schedulers import FIFOScheduler, TrialScheduler - logger = logging.getLogger(__name__) try: @@ -68,58 +55,52 @@ def _report_progress(runner, reporter, done=False): def run( - run_or_experiment: Union[str, Callable, Type], - name: Optional[str] = None, - metric: Optional[str] = None, - mode: Optional[str] = None, - stop: Union[None, Mapping, Stopper, Callable[[str, Mapping], - bool]] = None, - time_budget_s: Union[None, int, float, datetime.timedelta] = None, - config: Optional[Dict[str, Any]] = None, - resources_per_trial: Optional[Mapping[str, Union[float, int]]] = None, - num_samples: int = 1, - local_dir: Optional[str] = None, - search_alg: Optional[Union[Searcher, SearchAlgorithm]] = None, - scheduler: Optional[TrialScheduler] = None, - keep_checkpoints_num: Optional[int] = None, - checkpoint_score_attr: Optional[str] = None, - checkpoint_freq: int = 0, - checkpoint_at_end: bool = False, - verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, - progress_reporter: Optional[ProgressReporter] = None, - 
log_to_file: bool = False, - trial_name_creator: Optional[Callable[[Trial], str]] = None, - trial_dirname_creator: Optional[Callable[[Trial], str]] = None, - sync_config: Optional[SyncConfig] = None, - export_formats: Optional[Sequence] = None, - max_failures: int = 0, - fail_fast: bool = False, - restore: Optional[str] = None, - server_port: Optional[int] = None, - resume: bool = False, - queue_trials: bool = False, - reuse_actors: bool = False, - trial_executor: Optional[RayTrialExecutor] = None, - raise_on_failed_trial: bool = True, - callbacks: Optional[Sequence[Callback]] = None, + run_or_experiment, + name=None, + metric=None, + mode=None, + stop=None, + time_budget_s=None, + config=None, + resources_per_trial=None, + num_samples=1, + local_dir=None, + search_alg=None, + scheduler=None, + keep_checkpoints_num=None, + checkpoint_score_attr=None, + checkpoint_freq=0, + checkpoint_at_end=False, + verbose=Verbosity.V3_TRIAL_DETAILS, + progress_reporter=None, + log_to_file=False, + trial_name_creator=None, + trial_dirname_creator=None, + sync_config=None, + export_formats=None, + max_failures=0, + fail_fast=False, + restore=None, + server_port=None, + resume=False, + queue_trials=False, + reuse_actors=False, + trial_executor=None, + raise_on_failed_trial=True, + callbacks=None, # Deprecated args - loggers: Optional[Sequence[Type[Logger]]] = None, - ray_auto_init: Optional = None, - run_errored_only: Optional = None, - global_checkpoint_period: Optional = None, - with_server: Optional = None, - upload_dir: Optional = None, - sync_to_cloud: Optional = None, - sync_to_driver: Optional = None, - sync_on_checkpoint: Optional = None, - _remote: bool = None, -) -> ExperimentAnalysis: + loggers=None, + ray_auto_init=None, + run_errored_only=None, + global_checkpoint_period=None, + with_server=None, + upload_dir=None, + sync_to_cloud=None, + sync_to_driver=None, + sync_on_checkpoint=None, +): """Executes training. - When a SIGINT signal is received (e.g. 
through Ctrl+C), the tuning run - will gracefully shut down and checkpoint the latest experiment state. - Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step. - Examples: .. code-block:: python @@ -272,9 +253,7 @@ def run( ``ray.tune.callback.Callback`` class. If not passed, `LoggerCallback` and `SyncerCallback` callbacks are automatically added. - _remote (bool): Whether to run the Tune driver in a remote function. - This is disabled automatically if a custom trial executor is - passed in. This is enabled by default in Ray client mode. + Returns: ExperimentAnalysis: Object for experiment analysis. @@ -282,64 +261,6 @@ def run( Raises: TuneError: Any trials failed and `raise_on_failed_trial` is True. """ - - if _remote is None: - _remote = ray.util.client.ray.is_connected() - - if _remote is True and trial_executor: - raise ValueError("cannot use custom trial executor") - - if not trial_executor or isinstance(trial_executor, RayTrialExecutor): - _ray_auto_init() - - if _remote: - return ray.get( - ray.remote(num_cpus=0)(run).remote( - run_or_experiment, - name, - metric, - mode, - stop, - time_budget_s, - config, - resources_per_trial, - num_samples, - local_dir, - search_alg, - scheduler, - keep_checkpoints_num, - checkpoint_score_attr, - checkpoint_freq, - checkpoint_at_end, - verbose, - progress_reporter, - log_to_file, - trial_name_creator, - trial_dirname_creator, - sync_config, - export_formats, - max_failures, - fail_fast, - restore, - server_port, - resume, - queue_trials, - reuse_actors, - trial_executor, - raise_on_failed_trial, - callbacks, - # Deprecated args - loggers, - ray_auto_init, - run_errored_only, - global_checkpoint_period, - with_server, - upload_dir, - sync_to_cloud, - sync_to_driver, - sync_on_checkpoint, - _remote=False)) - all_start = time.time() if global_checkpoint_period: raise ValueError("global_checkpoint_period is deprecated. 
Set env var " @@ -495,24 +416,8 @@ def run( "`Trainable.default_resource_request` if using the " "Trainable API.") - original_handler = signal.getsignal(signal.SIGINT) - state = {signal.SIGINT: False} - - def sigint_handler(sig, frame): - logger.warning( - "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. " - "This will try to checkpoint the experiment state one last time. " - "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) " - "to skip. ") - state[signal.SIGINT] = True - # Restore original signal handler to react to future SIGINT signals - signal.signal(signal.SIGINT, original_handler) - - if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")): - signal.signal(signal.SIGINT, sigint_handler) - tune_start = time.time() - while not runner.is_finished() and not state[signal.SIGINT]: + while not runner.is_finished(): runner.step() if has_verbosity(Verbosity.V1_EXPERIMENT): _report_progress(runner, progress_reporter) @@ -535,7 +440,7 @@ def sigint_handler(sig, frame): incomplete_trials += [trial] if incomplete_trials: - if raise_on_failed_trial and not state[signal.SIGINT]: + if raise_on_failed_trial: raise TuneError("Trials did not complete", incomplete_trials) else: logger.error("Trials did not complete: %s", incomplete_trials) @@ -545,12 +450,6 @@ def sigint_handler(sig, frame): logger.info(f"Total run time: {all_taken:.2f} seconds " f"({tune_taken:.2f} seconds for the tuning loop).") - if state[signal.SIGINT]: - logger.warning( - "Experiment has been interrupted, but the most recent state was " - "saved. 
You can continue running this experiment by passing " - "`resume=True` to `tune.run()`") - trials = runner.get_trials() return ExperimentAnalysis( runner.checkpoint_file, @@ -559,21 +458,18 @@ def sigint_handler(sig, frame): default_mode=mode) -def run_experiments( - experiments: Union[Experiment, Mapping, Sequence[Union[Experiment, - Mapping]]], - scheduler: Optional[TrialScheduler] = None, - server_port: Optional[int] = None, - verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, - progress_reporter: Optional[ProgressReporter] = None, - resume: bool = False, - queue_trials: bool = False, - reuse_actors: bool = False, - trial_executor: Optional[RayTrialExecutor] = None, - raise_on_failed_trial: bool = True, - concurrent: bool = True, - callbacks: Optional[Sequence[Callback]] = None, - _remote: bool = None): +def run_experiments(experiments, + scheduler=None, + server_port=None, + verbose=Verbosity.V3_TRIAL_DETAILS, + progress_reporter=None, + resume=False, + queue_trials=False, + reuse_actors=False, + trial_executor=None, + raise_on_failed_trial=True, + concurrent=True, + callbacks=None): """Runs and blocks until all trials finish. Examples: @@ -587,32 +483,6 @@ def run_experiments( List of Trial objects, holding data for each executed trial. """ - if _remote is None: - _remote = ray.util.client.ray.is_connected() - - if _remote is True and trial_executor: - raise ValueError("cannot use custom trial executor") - - if not trial_executor or isinstance(trial_executor, RayTrialExecutor): - _ray_auto_init() - - if _remote: - return ray.get( - ray.remote(num_cpus=0)(run_experiments).remote( - experiments, - scheduler, - server_port, - verbose, - progress_reporter, - resume, - queue_trials, - reuse_actors, - trial_executor, - raise_on_failed_trial, - concurrent, - callbacks, - _remote=False)) - # This is important to do this here # because it schematize the experiments # and it conducts the implicit registration. 
@@ -647,14 +517,3 @@ def run_experiments( scheduler=scheduler, callbacks=callbacks).trials return trials - - -def _ray_auto_init(): - """Initialize Ray unless already configured.""" - if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1": - logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.") - elif not ray.is_initialized(): - logger.info("Initializing Ray automatically." - "For cluster usage or custom Ray initialization, " - "call `ray.init(...)` before `tune.run`.") - ray.init() diff --git a/python/ray/tune/utils/mock.py b/python/ray/tune/utils/mock.py index eea7b194d9ea..cc92fae26dee 100644 --- a/python/ray/tune/utils/mock.py +++ b/python/ray/tune/utils/mock.py @@ -1,6 +1,4 @@ -import logging import os - import numpy as np import json import random @@ -20,8 +18,6 @@ LOCAL_SYNC_TEMPLATE = "mkdir -p {target} && rsync -avz {source}/ {target}/" LOCAL_DELETE_TEMPLATE = "rm -rf {target}" -logger = logging.getLogger(__name__) - def mock_storage_client(): """Mocks storage client that treats a local dir as durable storage.""" @@ -114,25 +110,13 @@ def __init__(self, self.disable = disable def on_step_begin(self, **info): - import click from ray.autoscaler._private.commands import kill_node - failures = 0 - max_failures = 3 # With 10% probability inject failure to a worker. if random.random() < self.probability and not self.disable: # With 10% probability fully terminate the node. should_terminate = random.random() < self.probability - while failures < max_failures: - try: - kill_node( - self.config_path, - yes=True, - hard=should_terminate, - override_cluster_name=None) - except click.exceptions.ClickException: - failures += 1 - logger.exception("Killing random node failed in attempt " - "{}. 
" - "Retrying {} more times".format( - str(failures), - str(max_failures - failures))) + kill_node( + self.config_path, + yes=True, + hard=should_terminate, + override_cluster_name=None) diff --git a/python/ray/tune/utils/util.py b/python/ray/tune/utils/util.py index 73c56a013279..47a6b648eb1a 100644 --- a/python/ray/tune/utils/util.py +++ b/python/ray/tune/utils/util.py @@ -21,14 +21,10 @@ logger = logging.getLogger(__name__) - -def _import_gputil(): - try: - import GPUtil - except ImportError: - GPUtil = None - return GPUtil - +try: + import GPUtil +except ImportError: + GPUtil = None _pinned_objects = [] PINNED_OBJECT_PREFIX = "ray.tune.PinnedObject:" @@ -47,8 +43,6 @@ class UtilMonitor(Thread): def __init__(self, start=True, delay=0.7): self.stopped = True - GPUtil = _import_gputil() - self.GPUtil = GPUtil if GPUtil is None and start: logger.warning("Install gputil for GPU system monitoring.") @@ -73,10 +67,10 @@ def _read_utilization(self): float(psutil.cpu_percent(interval=None))) self.values["ram_util_percent"].append( float(getattr(psutil.virtual_memory(), "percent"))) - if self.GPUtil is not None: + if GPUtil is not None: gpu_list = [] try: - gpu_list = self.GPUtil.getGPUs() + gpu_list = GPUtil.getGPUs() except Exception: logger.debug("GPUtil failed to retrieve GPUs.") for gpu in gpu_list: @@ -139,13 +133,11 @@ class warn_if_slow: def __init__(self, name: str, threshold: Optional[float] = None, - message: Optional[str] = None, - disable: bool = False): + message: Optional[str] = None): self.name = name self.threshold = threshold or self.DEFAULT_THRESHOLD self.message = message or self.DEFAULT_MESSAGE self.too_slow = False - self.disable = disable def __enter__(self): self.start = time.time() @@ -153,8 +145,6 @@ def __enter__(self): def __exit__(self, type, value, traceback): now = time.time() - if self.disable: - return if now - self.start > self.threshold and now - START_OF_TIME > 60.0: self.too_slow = True duration = now - self.start @@ -468,31 +458,27 
@@ def load_newest_checkpoint(dirpath: str, ckpt_pattern: str) -> dict: return checkpoint_state -def wait_for_gpu(gpu_id=None, - target_util=0.01, - retry=20, - delay_s=5, - gpu_memory_limit=None): +def wait_for_gpu(gpu_id=None, gpu_memory_limit=0.1, retry=20): """Checks if a given GPU has freed memory. Requires ``gputil`` to be installed: ``pip install gputil``. Args: - gpu_id (Optional[Union[int, str]]): GPU id or uuid to check. - Must be found within GPUtil.getGPUs(). If none, resorts to + gpu_id (Optional[str]): GPU id to check. Must be found + within GPUtil.getGPUs(). If none, resorts to the first item returned from `ray.get_gpu_ids()`. - target_util (float): The utilization threshold to reach to unblock. - Set this to 0 to block until the GPU is completely free. - retry (int): Number of times to check GPU limit. Sleeps `delay_s` + gpu_memory_limit (float): If memory usage is below + this quantity, the check will break. + retry (int): Number of times to check GPU limit. Sleeps 5 seconds between checks. - delay_s (int): Seconds to wait before check. - gpu_memory_limit (float): Deprecated. Returns: - bool: True if free. + bool + True if free. Raises: - RuntimeError: If GPUtil is not found, if no GPUs are detected + RuntimeError + If GPUtil is not found, if no GPUs are detected or if the check fails. Example: @@ -505,54 +491,21 @@ def tune_func(config): tune.run(tune_func, resources_per_trial={"GPU": 1}, num_samples=10) """ - GPUtil = _import_gputil() - if gpu_memory_limit: - raise ValueError("'gpu_memory_limit' is deprecated. " - "Use 'target_util' instead.") if GPUtil is None: raise RuntimeError( "GPUtil must be installed if calling `wait_for_gpu`.") - - if gpu_id is None: + if not gpu_id: gpu_id_list = ray.get_gpu_ids() if not gpu_id_list: - raise RuntimeError("No GPU ids found from `ray.get_gpu_ids()`. " + raise RuntimeError(f"No GPU ids found from {ray.get_gpu_ids()}. 
" "Did you set Tune resources correctly?") gpu_id = gpu_id_list[0] - - gpu_attr = "id" - if isinstance(gpu_id, str): - if gpu_id.isdigit(): - # GPU ID returned from `ray.get_gpu_ids()` is a str representation - # of the int GPU ID - gpu_id = int(gpu_id) - else: - # Could not coerce gpu_id to int, so assume UUID - # and compare against `uuid` attribute e.g., - # 'GPU-04546190-b68d-65ac-101b-035f8faed77d' - gpu_attr = "uuid" - elif not isinstance(gpu_id, int): - raise ValueError(f"gpu_id ({type(gpu_id)}) must be type str/int.") - - def gpu_id_fn(g): - # Returns either `g.id` or `g.uuid` depending on - # the format of the input `gpu_id` - return getattr(g, gpu_attr) - - gpu_ids = {gpu_id_fn(g) for g in GPUtil.getGPUs()} - if gpu_id not in gpu_ids: - raise ValueError( - f"{gpu_id} not found in set of available GPUs: {gpu_ids}. " - "`wait_for_gpu` takes either GPU ordinal ID (e.g., '0') or " - "UUID (e.g., 'GPU-04546190-b68d-65ac-101b-035f8faed77d').") - + gpu_object = GPUtil.getGPUs()[gpu_id] for i in range(int(retry)): - gpu_object = next( - g for g in GPUtil.getGPUs() if gpu_id_fn(g) == gpu_id) - if gpu_object.memoryUtil > target_util: - logger.info(f"Waiting for GPU util to reach {target_util}. " - f"Util: {gpu_object.memoryUtil:0.3f}") - time.sleep(delay_s) + if gpu_object.memoryUsed > gpu_memory_limit: + logger.info(f"Waiting for GPU {gpu_id} memory to free. 
" + f"Mem: {gpu_object.memoryUsed:0.3f}") + time.sleep(5) else: return True raise RuntimeError("GPU memory was not freed.") diff --git a/python/ray/tune/utils/util_test.py b/python/ray/tune/utils/util_test.py new file mode 100644 index 000000000000..534061f686d0 --- /dev/null +++ b/python/ray/tune/utils/util_test.py @@ -0,0 +1,43 @@ +from collections import OrderedDict + +import unittest + +from .util import unflatten_dict + + +class UnflattenDictTest(unittest.TestCase): + def test_output_type(self): + in_ = OrderedDict({"a/b": 1, "c/d": 2, "e": 3}) + out = unflatten_dict(in_) + assert type(in_) is type(out) + + def test_one_level_nested(self): + result = unflatten_dict({"a/b": 1, "c/d": 2, "e": 3}) + assert result == {"a": {"b": 1}, "c": {"d": 2}, "e": 3} + + def test_multi_level_nested(self): + result = unflatten_dict({"a/b/c/d": 1, "b/c/d": 2, "c/d": 3, "e": 4}) + assert result == { + "a": { + "b": { + "c": { + "d": 1, + }, + }, + }, + "b": { + "c": { + "d": 2, + }, + }, + "c": { + "d": 3, + }, + "e": 4, + } + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/__init__.py b/python/ray/util/__init__.py index d20bac2a3ef4..b2dc97bbd41a 100644 --- a/python/ray/util/__init__.py +++ b/python/ray/util/__init__.py @@ -4,10 +4,9 @@ from ray.util.debug import log_once, disable_log_once_globally, \ enable_periodic_logging from ray.util.placement_group import (placement_group, placement_group_table, - remove_placement_group, - get_placement_group) + remove_placement_group) from ray.util import rpdb as pdb -from ray.util.serialization import register_serializer, deregister_serializer +from ray.util.serialization import register_serializer from ray.util.client_connect import connect, disconnect @@ -20,12 +19,10 @@ "pdb", "placement_group", "placement_group_table", - "get_placement_group", "remove_placement_group", "inspect_serializability", "collective", "connect", "disconnect", 
"register_serializer", - "deregister_serializer", ] diff --git a/python/ray/util/client/__init__.py b/python/ray/util/client/__init__.py index 94a664a80e33..02aab93ff5ae 100644 --- a/python/ray/util/client/__init__.py +++ b/python/ray/util/client/__init__.py @@ -1,14 +1,9 @@ from typing import List, Tuple, Dict, Any -import sys import logging logger = logging.getLogger(__name__) -# This version string is incremented to indicate breaking changes in the -# protocol that require upgrading the client version. -CURRENT_PROTOCOL_VERSION = "2020-02-01" - class RayAPIStub: """This class stands in as the replacement API for the `import ray` module. @@ -30,18 +25,13 @@ def connect(self, conn_str: str, secure: bool = False, metadata: List[Tuple[str, str]] = None, - connection_retries: int = 3, - *, - ignore_version: bool = False) -> Dict[str, Any]: + connection_retries: int = 3) -> Dict[str, Any]: """Connect the Ray Client to a server. Args: conn_str: Connection string, in the form "[host]:port" secure: Whether to use a TLS secured gRPC channel metadata: gRPC metadata to send on connect - connection_retries: number of connection attempts to make - ignore_version: whether to ignore Python or Ray version mismatches. - This should only be used for debugging purposes. Returns: Dictionary of connection info, e.g., {"num_clients": 1}. 
@@ -66,34 +56,11 @@ def connect(self, metadata=metadata, connection_retries=connection_retries) self.api.worker = self.client_worker - conn_info = self.client_worker.connection_info() - self._check_versions(conn_info, ignore_version) - return conn_info + return self.client_worker.connection_info() except Exception: self.disconnect() raise - def _check_versions(self, conn_info: Dict[str, Any], - ignore_version: bool) -> None: - local_major_minor = f"{sys.version_info[0]}.{sys.version_info[1]}" - if not conn_info["python_version"].startswith(local_major_minor): - version_str = f"{local_major_minor}.{sys.version_info[2]}" - msg = "Python minor versions differ between client and server:" + \ - f" client is {version_str}," + \ - f" server is {conn_info['python_version']}" - if ignore_version: - logger.warning(msg) - else: - raise RuntimeError(msg) - if CURRENT_PROTOCOL_VERSION != conn_info["protocol_version"]: - msg = "Client Ray installation incompatible with server:" + \ - f" client is {CURRENT_PROTOCOL_VERSION}," + \ - f" server is {conn_info['protocol_version']}" - if ignore_version: - logger.warning(msg) - else: - raise RuntimeError(msg) - def disconnect(self): """Disconnect the Ray Client. 
""" @@ -122,17 +89,14 @@ def __getattr__(self, key: str): return getattr(self.api, key) def is_connected(self) -> bool: - if self.client_worker is None: - return False - return self.client_worker.is_connected() + return self.client_worker is not None def init(self, *args, **kwargs): if self._server is not None: raise Exception("Trying to start two instances of ray via client") import ray.util.client.server.server as ray_client_server - server_handle, address_info = ray_client_server.init_and_serve( + self._server, address_info = ray_client_server.init_and_serve( "localhost:50051", *args, **kwargs) - self._server = server_handle.grpc_server self.connect("localhost:50051") self._connected_with_init = True return address_info diff --git a/python/ray/util/client/api.py b/python/ray/util/client/api.py index 5b1ae881e5cd..7d8576d1f276 100644 --- a/python/ray/util/client/api.py +++ b/python/ray/util/client/api.py @@ -4,8 +4,6 @@ from ray.util.client.runtime_context import ClientWorkerPropertyAPI from typing import TYPE_CHECKING if TYPE_CHECKING: - from ray.actor import ActorClass - from ray.remote_function import RemoteFunction from ray.util.client.common import ClientStub from ray.util.client.common import ClientActorHandle from ray.util.client.common import ClientObjectRef @@ -267,18 +265,6 @@ def _internal_kv_list(self, prefix: bytes) -> bytes: """Hook for internal_kv._internal_kv_list.""" return self.worker.internal_kv_list(as_bytes(prefix)) - def _convert_actor(self, actor: "ActorClass") -> str: - """Register a ClientActorClass for the ActorClass and return a UUID""" - return self.worker._convert_actor(actor) - - def _convert_function(self, func: "RemoteFunction") -> str: - """Register a ClientRemoteFunc for the ActorClass and return a UUID""" - return self.worker._convert_function(func) - - def _get_converted(self, key: str) -> "ClientStub": - """Given a UUID, return the converted object""" - return self.worker._get_converted(key) - def __getattr__(self, key: str): 
if not key.startswith("_"): raise NotImplementedError( diff --git a/python/ray/util/client/common.py b/python/ray/util/client/common.py index 8eac0983a390..2bcd14f3f586 100644 --- a/python/ray/util/client/common.py +++ b/python/ray/util/client/common.py @@ -82,11 +82,7 @@ def remote(self, *args, **kwargs): def options(self, **kwargs): return OptionWrapper(self, kwargs) - def _remote(self, args=None, kwargs=None, **option_args): - if args is None: - args = [] - if kwargs is None: - kwargs = {} + def _remote(self, args=[], kwargs={}, **option_args): return self.options(**option_args).remote(*args, **kwargs) def __repr__(self): @@ -154,11 +150,7 @@ def remote(self, *args, **kwargs) -> "ClientActorHandle": def options(self, **kwargs): return ActorOptionWrapper(self, kwargs) - def _remote(self, args=None, kwargs=None, **option_args): - if args is None: - args = [] - if kwargs is None: - kwargs = {} + def _remote(self, args=[], kwargs={}, **option_args): return self.options(**option_args).remote(*args, **kwargs) def __repr__(self): @@ -238,11 +230,7 @@ def __repr__(self): def options(self, **kwargs): return OptionWrapper(self, kwargs) - def _remote(self, args=None, kwargs=None, **option_args): - if args is None: - args = [] - if kwargs is None: - kwargs = {} + def _remote(self, args=[], kwargs={}, **option_args): return self.options(**option_args).remote(*args, **kwargs) def _prepare_client_task(self) -> ray_client_pb2.ClientTask: diff --git a/python/ray/util/client/dataclient.py b/python/ray/util/client/dataclient.py index a0750b790bb6..6e29ea927b83 100644 --- a/python/ray/util/client/dataclient.py +++ b/python/ray/util/client/dataclient.py @@ -37,7 +37,6 @@ def __init__(self, channel: "grpc._channel.Channel", client_id: str, self._req_id = 0 self._client_id = client_id self._metadata = metadata - self._in_shutdown = False self.data_thread.start() def _next_id(self) -> int: @@ -68,19 +67,9 @@ def _data_main(self) -> None: self.ready_data[response.req_id] = response 
self.cv.notify_all() except grpc.RpcError as e: - with self.cv: - self._in_shutdown = True - self.cv.notify_all() - if e.code() == grpc.StatusCode.CANCELLED: + if grpc.StatusCode.CANCELLED == e.code(): # Gracefully shutting down logger.info("Cancelling data channel") - elif e.code() == grpc.StatusCode.UNAVAILABLE: - # TODO(barakmich): The server may have - # dropped. In theory, we can retry, as per - # https://grpc.github.io/grpc/core/md_doc_statuscodes.html but - # in practice we may need to think about the correct semantics - # here. - logger.info("Server disconnected from data channel") else: logger.error( f"Got Error from data channel -- shutting down: {e}") @@ -99,11 +88,7 @@ def _blocking_send(self, req: ray_client_pb2.DataRequest self.request_queue.put(req) data = None with self.cv: - self.cv.wait_for( - lambda: req_id in self.ready_data or self._in_shutdown) - if self._in_shutdown: - raise ConnectionError( - f"cannot send request {req}: data channel shutting down") + self.cv.wait_for(lambda: req_id in self.ready_data) data = self.ready_data[req_id] del self.ready_data[req_id] return data diff --git a/python/ray/util/client/logsclient.py b/python/ray/util/client/logsclient.py index f7902024d256..0e4d02846a37 100644 --- a/python/ray/util/client/logsclient.py +++ b/python/ray/util/client/logsclient.py @@ -44,18 +44,8 @@ def _log_main(self) -> None: self.stdstream(level=record.level, msg=record.msg) self.log(level=record.level, msg=record.msg) except grpc.RpcError as e: - if e.code() == grpc.StatusCode.CANCELLED: - # Graceful shutdown. We've cancelled our own connection. - logger.info("Cancelling logs channel") - elif e.code() == grpc.StatusCode.UNAVAILABLE: - # TODO(barakmich): The server may have - # dropped. In theory, we can retry, as per - # https://grpc.github.io/grpc/core/md_doc_statuscodes.html but - # in practice we may need to think about the correct semantics - # here. 
- logger.info("Server disconnected from logs channel") - else: - # Some other, unhandled, gRPC error + if grpc.StatusCode.CANCELLED != e.code(): + # Not just shutting down normally logger.error( f"Got Error from logger channel -- shutting down: {e}") raise e diff --git a/python/ray/util/client/options.py b/python/ray/util/client/options.py index b2f1dae8138a..79727b126473 100644 --- a/python/ray/util/client/options.py +++ b/python/ray/util/client/options.py @@ -46,10 +46,9 @@ def validate_options( raise TypeError(f"Invalid option passed to remote(): {k}") validator = options[k] if len(validator) != 0: - if v is not None: - if not isinstance(v, validator[0]): - raise ValueError(validator[2]) - if not validator[1](v): - raise ValueError(validator[2]) + if not isinstance(v, validator[0]): + raise ValueError(validator[2]) + if not validator[1](v): + raise ValueError(validator[2]) out[k] = v return out diff --git a/python/ray/util/client/ray_client_helpers.py b/python/ray/util/client/ray_client_helpers.py index a7f16c246aa7..be5a2918c3b2 100644 --- a/python/ray/util/client/ray_client_helpers.py +++ b/python/ray/util/client/ray_client_helpers.py @@ -1,42 +1,16 @@ from contextlib import contextmanager -import ray as real_ray import ray.util.client.server.server as ray_client_server from ray.util.client import ray @contextmanager def ray_start_client_server(): - with ray_start_client_server_pair() as pair: - client, server = pair - yield client - - -@contextmanager -def ray_start_client_server_pair(): ray._inside_client_test = True server = ray_client_server.serve("localhost:50051") ray.connect("localhost:50051") try: - yield ray, server - finally: - ray._inside_client_test = False - ray.disconnect() - server.stop(0) - - -@contextmanager -def ray_start_cluster_client_server_pair(address): - ray._inside_client_test = True - - def ray_connect_handler(): - real_ray.init(address=address) - - server = ray_client_server.serve( - "localhost:50051", 
ray_connect_handler=ray_connect_handler) - ray.connect("localhost:50051") - try: - yield ray, server + yield ray finally: ray._inside_client_test = False ray.disconnect() diff --git a/python/ray/util/client/server/dataservicer.py b/python/ray/util/client/server/dataservicer.py index c9e345219a9b..7a7fb3eae73f 100644 --- a/python/ray/util/client/server/dataservicer.py +++ b/python/ray/util/client/server/dataservicer.py @@ -3,13 +3,11 @@ import grpc import sys -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING from threading import Lock import ray.core.generated.ray_client_pb2 as ray_client_pb2 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc -from ray.util.client import CURRENT_PROTOCOL_VERSION -from ray._private.client_mode_hook import disable_client_hook if TYPE_CHECKING: from ray.util.client.server.server import RayletServicer @@ -18,12 +16,10 @@ class DataServicer(ray_client_pb2_grpc.RayletDataStreamerServicer): - def __init__(self, basic_service: "RayletServicer", - ray_connect_handler: Callable): + def __init__(self, basic_service: "RayletServicer"): self.basic_service = basic_service self._clients_lock = Lock() self._num_clients = 0 # guarded by self._clients_lock - self.ray_connect_handler = ray_connect_handler def Datapath(self, request_iterator, context): metadata = {k: v for k, v in context.invocation_metadata()} @@ -34,9 +30,6 @@ def Datapath(self, request_iterator, context): logger.info(f"New data connection from client {client_id}") try: with self._clients_lock: - with disable_client_hook(): - if self._num_clients == 0 and not ray.is_initialized(): - self.ray_connect_handler() self._num_clients += 1 for req in request_iterator: resp = None @@ -57,8 +50,16 @@ def Datapath(self, request_iterator, context): resp = ray_client_pb2.DataResponse( release=ray_client_pb2.ReleaseResponse(ok=released)) elif req_type == "connection_info": - resp = ray_client_pb2.DataResponse( - 
connection_info=self._build_connection_response()) + with self._clients_lock: + cur_num_clients = self._num_clients + info = ray_client_pb2.ConnectionInfoResponse( + num_clients=cur_num_clients, + python_version="{}.{}.{}".format( + sys.version_info[0], sys.version_info[1], + sys.version_info[2]), + ray_version=ray.__version__, + ray_commit=ray.__commit__) + resp = ray_client_pb2.DataResponse(connection_info=info) else: raise Exception(f"Unreachable code: Request type " f"{req_type} not handled in Datapath") @@ -69,21 +70,5 @@ def Datapath(self, request_iterator, context): finally: logger.info(f"Lost data connection from client {client_id}") self.basic_service.release_all(client_id) - with self._clients_lock: self._num_clients -= 1 - - with disable_client_hook(): - if self._num_clients == 0: - ray.shutdown() - - def _build_connection_response(self): - with self._clients_lock: - cur_num_clients = self._num_clients - return ray_client_pb2.ConnectionInfoResponse( - num_clients=cur_num_clients, - python_version="{}.{}.{}".format( - sys.version_info[0], sys.version_info[1], sys.version_info[2]), - ray_version=ray.__version__, - ray_commit=ray.__commit__, - protocol_version=CURRENT_PROTOCOL_VERSION) diff --git a/python/ray/util/client/server/server.py b/python/ray/util/client/server/server.py index 6e65c929b8d8..19a192337105 100644 --- a/python/ray/util/client/server/server.py +++ b/python/ray/util/client/server/server.py @@ -3,7 +3,6 @@ import grpc import base64 from collections import defaultdict -from dataclasses import dataclass from typing import Any from typing import Dict @@ -408,32 +407,22 @@ def decode_options( return opts -@dataclass -class ClientServerHandle: - """Holds the handles to the registered gRPC servicers and their server.""" - task_servicer: RayletServicer - data_servicer: DataServicer - logs_servicer: LogstreamServicer - grpc_server: grpc.Server +_current_servicer: Optional[RayletServicer] = None - # Add a hook for all the cases that previously - # 
expected simply a gRPC server - def __getattr__(self, attr): - return getattr(self.grpc_server, attr) +# Used by tests to peek inside the servicer +def _get_current_servicer(): + global _current_servicer + return _current_servicer -def serve(connection_str, ray_connect_handler=None): - def default_connect_handler(): - with disable_client_hook(): - if not ray.is_initialized(): - return ray.init() - ray_connect_handler = ray_connect_handler or default_connect_handler +def serve(connection_str): server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) task_servicer = RayletServicer() - data_servicer = DataServicer( - task_servicer, ray_connect_handler=ray_connect_handler) + data_servicer = DataServicer(task_servicer) logs_servicer = LogstreamServicer() + global _current_servicer + _current_servicer = task_servicer ray_client_pb2_grpc.add_RayletDriverServicer_to_server( task_servicer, server) ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server( @@ -441,32 +430,16 @@ def default_connect_handler(): ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server( logs_servicer, server) server.add_insecure_port(connection_str) - current_handle = ClientServerHandle( - task_servicer=task_servicer, - data_servicer=data_servicer, - logs_servicer=logs_servicer, - grpc_server=server, - ) server.start() - return current_handle + return server def init_and_serve(connection_str, *args, **kwargs): with disable_client_hook(): # Disable client mode inside the worker's environment info = ray.init(*args, **kwargs) - - def ray_connect_handler(): - # Ray client will disconnect from ray when - # num_clients == 0. 
- if ray.is_initialized(): - return info - else: - return ray.init(*args, **kwargs) - - server_handle = serve( - connection_str, ray_connect_handler=ray_connect_handler) - return (server_handle, info) + server = serve(connection_str) + return (server, info) def shutdown_with_server(server, _exiting_interpreter=False): @@ -475,19 +448,6 @@ def shutdown_with_server(server, _exiting_interpreter=False): ray.shutdown(_exiting_interpreter) -def create_ray_handler(redis_address, redis_password): - def ray_connect_handler(): - if redis_address: - if redis_password: - ray.init(address=redis_address, _redis_password=redis_password) - else: - ray.init(address=redis_address) - else: - ray.init() - - return ray_connect_handler - - def main(): import argparse parser = argparse.ArgumentParser() @@ -507,13 +467,18 @@ def main(): help="Password for connecting to Redis") args = parser.parse_args() logging.basicConfig(level="INFO") - - ray_connect_handler = create_ray_handler(args.redis_address, - args.redis_password) - + if args.redis_address: + if args.redis_password: + ray.init( + address=args.redis_address, + _redis_password=args.redis_password) + else: + ray.init(address=args.redis_address) + else: + ray.init() hostport = "%s:%d" % (args.host, args.port) logger.info(f"Starting Ray Client server on {hostport}") - server = serve(hostport, ray_connect_handler) + server = serve(hostport) try: while True: time.sleep(1000) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index db9a1cc63052..3c6401fdafd6 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -5,7 +5,6 @@ import base64 import json import logging -import time import uuid from collections import defaultdict from typing import Any @@ -13,7 +12,6 @@ from typing import List from typing import Tuple from typing import Optional -from typing import TYPE_CHECKING import grpc @@ -23,32 +21,18 @@ from ray.util.client.client_pickler import convert_to_arg from 
ray.util.client.client_pickler import dumps_from_client from ray.util.client.client_pickler import loads_from_server -from ray.util.client.common import ClientStub from ray.util.client.common import ClientActorHandle -from ray.util.client.common import ClientActorClass -from ray.util.client.common import ClientRemoteFunc from ray.util.client.common import ClientActorRef from ray.util.client.common import ClientObjectRef from ray.util.client.dataclient import DataClient from ray.util.client.logsclient import LogstreamClient -if TYPE_CHECKING: - from ray.actor import ActorClass - from ray.remote_function import RemoteFunction - logger = logging.getLogger(__name__) INITIAL_TIMEOUT_SEC = 5 MAX_TIMEOUT_SEC = 30 -def backoff(timeout: int) -> int: - timeout = timeout + 5 - if timeout > MAX_TIMEOUT_SEC: - timeout = MAX_TIMEOUT_SEC - return timeout - - class Worker: def __init__(self, conn_str: str = "", @@ -68,61 +52,30 @@ def __init__(self, """ self.metadata = metadata if metadata else [] self.channel = None - self.server = None - self._conn_state = grpc.ChannelConnectivity.IDLE self._client_id = make_client_id() - self._converted: Dict[str, ClientStub] = {} if secure: credentials = grpc.ssl_channel_credentials() self.channel = grpc.secure_channel(conn_str, credentials) else: self.channel = grpc.insecure_channel(conn_str) - self.channel.subscribe(self._on_channel_state_change) - - # Retry the connection until the channel responds to something - # looking like a gRPC connection, though it may be a proxy. conn_attempts = 0 timeout = INITIAL_TIMEOUT_SEC - service_ready = False - while conn_attempts < max(connection_retries, 1): + while conn_attempts < connection_retries + 1: conn_attempts += 1 try: - # Let gRPC wait for us to see if the channel becomes ready. - # If it throws, we couldn't connect. grpc.channel_ready_future(self.channel).result(timeout=timeout) - # The HTTP2 channel is ready. Wrap the channel with the - # RayletDriverStub, allowing for unary requests. 
- self.server = ray_client_pb2_grpc.RayletDriverStub( - self.channel) - service_ready = bool(self.ping_server()) - if service_ready: - break - # Ray is not ready yet, wait a timeout - time.sleep(timeout) + break except grpc.FutureTimeoutError: - logger.info( - f"Couldn't connect channel in {timeout} seconds, retrying") - # Note that channel_ready_future constitutes its own timeout, - # which is why we do not sleep here. - except grpc.RpcError as e: - logger.info("Ray client server unavailable, " - f"retrying in {timeout}s...") - logger.debug(f"Received when checking init: {e.details()}") - # Ray is not ready yet, wait a timeout. - time.sleep(timeout) - # Fallthrough, backoff, and retry at the top of the loop - logger.info("Waiting for Ray to become ready on the server, " - f"retry in {timeout}s...") - timeout = backoff(timeout) - - # If we made it through the loop without service_ready - # it means we've used up our retries and - # should error back to the user. - if not service_ready: - raise ConnectionError("ray client connection timeout") - - # Initialize the streams to finish protocol negotiation. 
+ if conn_attempts >= connection_retries: + raise ConnectionError("ray client connection timeout") + logger.info(f"Couldn't connect in {timeout} seconds, retrying") + timeout = timeout + 5 + if timeout > MAX_TIMEOUT_SEC: + timeout = MAX_TIMEOUT_SEC + + self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel) + self.data_client = DataClient(self.channel, self._client_id, self.metadata) self.reference_count: Dict[bytes, int] = defaultdict(int) @@ -131,10 +84,6 @@ def __init__(self, self.log_client.set_logstream_level(logging.INFO) self.closed = False - def _on_channel_state_change(self, conn_state: grpc.ChannelConnectivity): - logger.debug(f"client gRPC channel state change: {conn_state}") - self._conn_state = conn_state - def connection_info(self): try: data = self.data_client.ConnectionInfo() @@ -145,7 +94,6 @@ def connection_info(self): "python_version": data.python_version, "ray_version": data.ray_version, "ray_commit": data.ray_commit, - "protocol_version": data.protocol_version, } def get(self, vals, *, timeout: Optional[float] = None) -> Any: @@ -173,11 +121,7 @@ def _get(self, ref: ClientObjectRef, timeout: float): except grpc.RpcError as e: raise e.details() if not data.valid: - try: - err = cloudpickle.loads(data.error) - except Exception: - logger.exception("Failed to deserialize {}".format(data.error)) - raise + err = cloudpickle.loads(data.error) logger.error(err) raise err return loads_from_server(data.data) @@ -261,12 +205,7 @@ def _call_schedule_for_task( except grpc.RpcError as e: raise decode_exception(e.details) if not ticket.valid: - try: - raise cloudpickle.loads(ticket.error) - except Exception: - logger.exception("Failed to deserialize {}".format( - ticket.error)) - raise + raise cloudpickle.loads(ticket.error) return ticket.return_ids def call_release(self, id: bytes) -> None: @@ -374,62 +313,6 @@ def is_initialized(self) -> bool: ray_client_pb2.ClusterInfoType.IS_INITIALIZED) return False - def ping_server(self) -> bool: - """Simple 
health check. - - Piggybacks the IS_INITIALIZED call to check if the server provides - an actual response. - """ - if self.server is not None: - result = self.get_cluster_info( - ray_client_pb2.ClusterInfoType.IS_INITIALIZED) - return result is not None - return False - - def is_connected(self) -> bool: - return self._conn_state == grpc.ChannelConnectivity.READY - - def _convert_actor(self, actor: "ActorClass") -> str: - """Register a ClientActorClass for the ActorClass and return a UUID""" - key = uuid.uuid4().hex - md = actor.__ray_metadata__ - cls = md.modified_class - self._converted[key] = ClientActorClass( - cls, - options={ - "max_restarts": md.max_restarts, - "max_task_retries": md.max_task_retries, - "num_cpus": md.num_cpus, - "num_gpus": md.num_gpus, - "memory": md.memory, - "object_store_memory": md.object_store_memory, - "resources": md.resources, - "accelerator_type": md.accelerator_type, - }) - return key - - def _convert_function(self, func: "RemoteFunction") -> str: - """Register a ClientRemoteFunc for the ActorClass and return a UUID""" - key = uuid.uuid4().hex - f = func._function - self._converted[key] = ClientRemoteFunc( - f, - options={ - "num_cpus": func._num_cpus, - "num_gpus": func._num_gpus, - "max_calls": func._max_calls, - "max_retries": func._max_retries, - "resources": func._resources, - "accelerator_type": func._accelerator_type, - "num_returns": func._num_returns, - "memory": func._memory - }) - return key - - def _get_converted(self, key: str) -> "ClientStub": - """Given a UUID, return the converted object""" - return self._converted[key] - def make_client_id() -> str: id = uuid.uuid4() diff --git a/python/ray/util/collective/__init__.py b/python/ray/util/collective/__init__.py index 694698474062..4ae88660702f 100644 --- a/python/ray/util/collective/__init__.py +++ b/python/ray/util/collective/__init__.py @@ -1,15 +1,11 @@ -from ray.util.collective.collective import nccl_available, gloo_available, \ +from 
ray.util.collective.collective import nccl_available, mpi_available, \ is_group_initialized, init_collective_group, destroy_collective_group, \ - declare_collective_group, get_rank, get_world_size, allreduce, \ - allreduce_multigpu, barrier, reduce, reduce_multigpu, broadcast, \ - broadcast_multigpu, allgather, allgather_multigpu, reducescatter, \ - reducescatter_multigpu, send, send_multigpu, recv, recv_multigpu + get_rank, get_world_size, allreduce, barrier, reduce, broadcast, \ + allgather, reducescatter, send, recv __all__ = [ - "nccl_available", "gloo_available", "is_group_initialized", - "init_collective_group", "destroy_collective_group", - "declare_collective_group", "get_rank", "get_world_size", "allreduce", - "allreduce_multigpu", "barrier", "reduce", "reduce_multigpu", "broadcast", - "broadcast_multigpu", "allgather", "allgather_multigpu", "reducescatter", - "reducescatter_multigpu", "send", "send_multigpu", "recv", "recv_multigpu" + "nccl_available", "mpi_available", "is_group_initialized", + "init_collective_group", "destroy_collective_group", "get_rank", + "get_world_size", "allreduce", "barrier", "reduce", "broadcast", + "allgather", "reducescatter", "send", "recv" ] diff --git a/python/ray/util/collective/collective.py b/python/ray/util/collective/collective.py index afd523e6bf37..08f9026b0467 100644 --- a/python/ray/util/collective/collective.py +++ b/python/ray/util/collective/collective.py @@ -7,9 +7,14 @@ import ray from ray.util.collective import types -_GLOO_AVAILABLE = False +_MPI_AVAILABLE = False _NCCL_AVAILABLE = True +# try: +# from ray.util.collective.collective_group.mpi_collective_group \ +# import MPIGroup +# except ImportError: +# _MPI_AVAILABLE = False try: from ray.util.collective.collective_group import NCCLGroup except ImportError: @@ -22,8 +27,8 @@ def nccl_available(): return _NCCL_AVAILABLE -def gloo_available(): - return _GLOO_AVAILABLE +def mpi_available(): + return _MPI_AVAILABLE class GroupManager(object): @@ -46,11 +51,9 
@@ def create_collective_group(self, backend, world_size, rank, group_name): """ backend = types.Backend(backend) if backend == types.Backend.MPI: - raise RuntimeError("Ray does not support MPI.") - elif backend == types.Backend.GLOO: raise NotImplementedError() elif backend == types.Backend.NCCL: - logger.debug("Creating NCCL group: '{}'...".format(group_name)) + logger.debug("creating NCCL group: '{}'".format(group_name)) g = NCCLGroup(world_size, rank, group_name) self._name_group_map[group_name] = g self._group_name_map[g] = group_name @@ -97,9 +100,9 @@ def init_collective_group(world_size: int, """Initialize a collective group inside an actor process. Args: - world_size (int): the total number of processes in the group. + world_size (int): the total number of processed in the group. rank (int): the rank of the current process. - backend: the CCL backend to use, NCCL or GLOO. + backend: the CCL backend to use, NCCL or MPI. group_name (str): the name of the collective group. Returns: @@ -134,13 +137,10 @@ def declare_collective_group(actors, Args: actors (list): a list of actors to be set in a collective group. - world_size (int): the total number of processes in the group. - ranks (List[int]): the rank of each actor. - backend: the CCL backend to use, NCCL or GLOO. - group_name (str): the name of the collective group. - - Returns: - None + group_options (dict): a dictionary that contains group_name(str), + world_size(int), rank(list of int, e.g. [0,1] + means the first actor is rank 0, and the second + actor is rank 1), backend(str). """ backend = types.Backend(backend) _check_backend_availability(backend) @@ -162,25 +162,18 @@ def declare_collective_group(actors, "Ranks must be a permutation from 0 to '{}'. Got '{}'.".format( len(ranks), "".join([str(r) for r in ranks]))) - if world_size <= 0: - raise RuntimeError("World size must be greater than zero. 
" - "Got '{}'.".format(world_size)) - if not all(ranks) >= 0: - raise RuntimeError("Ranks must be non-negative.") - if not all(ranks) < world_size: - raise RuntimeError("Ranks cannot be greater than world_size.") + assert world_size > 0 + assert all(ranks) >= 0 and all(ranks) < world_size # avoid a circular dependency from ray.util.collective.util import Info - # store the information into a NamedActor that can be accessed later. + # store the information into a NamedActor that can be accessed later/ name = "info_" + group_name actors_id = [a._ray_actor_id for a in actors] - # TODO (Dacheng): how do we recycle this name actor? info = Info.options(name=name, lifetime="detached").remote() ray.get([info.set_info.remote(actors_id, world_size, ranks, backend)]) -# TODO (we need a declarative destroy() API here.) def destroy_collective_group(group_name: str = "default") -> None: """Destroy a collective group given its group name.""" _check_inside_actor() @@ -213,8 +206,9 @@ def get_world_size(group_name: str = "default") -> int: group_name: the name of the group to query Returns: - The world size of the collective group, -1 if the group does - not exist or the process does not belong to the group. + The world size of the collective group, + -1 if the group does not exist or the process does + not belong to the group. """ _check_inside_actor() if not is_group_initialized(group_name): @@ -238,29 +232,7 @@ def allreduce(tensor, group_name: str = "default", op=types.ReduceOp.SUM): g = _check_and_get_group(group_name) opts = types.AllReduceOptions opts.reduceOp = op - g.allreduce([tensor], opts) - - -def allreduce_multigpu(tensor_list: list, - group_name: str = "default", - op=types.ReduceOp.SUM): - """Collective allreduce a list of tensors across the group. - - Args: - tensor_list (List[tensor]): list of tensors to be allreduced, - each on a GPU. - group_name (str): the collective group name to perform allreduce. 
- - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_list_input(tensor_list) - g = _check_and_get_group(group_name) - opts = types.AllReduceOptions - opts.reduceOp = op - g.allreduce(tensor_list, opts) + g.allreduce(tensor, opts) def barrier(group_name: str = "default"): @@ -284,8 +256,8 @@ def reduce(tensor, Args: tensor: the tensor to be reduced on this process. - dst_rank (int): the rank of the destination process. - group_name (str): the collective group name to perform reduce. + dst_rank: the rank of the destination process. + group_name: the collective group name to perform reduce. op: The reduce operation. Returns: @@ -299,42 +271,7 @@ def reduce(tensor, opts = types.ReduceOptions() opts.reduceOp = op opts.root_rank = dst_rank - opts.root_tensor = 0 - g.reduce([tensor], opts) - - -def reduce_multigpu(tensor_list: list, - dst_rank: int = 0, - dst_tensor: int = 0, - group_name: str = "default", - op=types.ReduceOp.SUM): - """Reduce the tensor across the group to the destination rank - and destination tensor. - - Args: - tensor_list: the list of tensors to be reduced on this process; - each tensor located on a GPU. - dst_rank (int): the rank of the destination process. - dst_tensor: the index of GPU at the destination. - group_name (str): the collective group name to perform reduce. - op: The reduce operation. 
- - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_list_input(tensor_list) - g = _check_and_get_group(group_name) - - # check dst rank - _check_rank_valid(g, dst_rank) - _check_root_tensor_valid(len(tensor_list), dst_tensor) - opts = types.ReduceOptions() - opts.reduceOp = op - opts.root_rank = dst_rank - opts.root_tensor = dst_tensor - g.reduce(tensor_list, opts) + g.reduce(tensor, opts) def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): @@ -342,8 +279,8 @@ def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): Args: tensor: the tensor to be broadcasted (src) or received (destination). - src_rank (int): the rank of the source process. - group_name (str): the collective group name to perform broadcast. + src_rank: the rank of the source process. + group_name: he collective group name to perform broadcast. Returns: None @@ -355,37 +292,7 @@ def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): _check_rank_valid(g, src_rank) opts = types.BroadcastOptions() opts.root_rank = src_rank - opts.root_tensor = 0 - g.broadcast([tensor], opts) - - -def broadcast_multigpu(tensor_list, - src_rank: int = 0, - src_tensor: int = 0, - group_name: str = "default"): - """Broadcast the tensor from a source GPU to all other GPUs. - - Args: - tensor_list: the tensors to broadcast (src) or receive (dst). - src_rank (int): the rank of the source process. - src_tensor (int): the index of the source GPU on the source process. - group_name (str): the collective group name to perform broadcast. 
- - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_list_input(tensor_list) - g = _check_and_get_group(group_name) - - # check src rank - _check_rank_valid(g, src_rank) - _check_root_tensor_valid(len(tensor_list), src_tensor) - opts = types.BroadcastOptions() - opts.root_rank = src_rank - opts.root_tensor = src_tensor - g.broadcast(tensor_list, opts) + g.broadcast(tensor, opts) def allgather(tensor_list: list, tensor, group_name: str = "default"): @@ -394,7 +301,7 @@ def allgather(tensor_list: list, tensor, group_name: str = "default"): Args: tensor_list (list): the results, stored as a list of tensors. tensor: the tensor (to be gathered) in the current process - group_name (str): the name of the collective group. + group_name: the name of the collective group. Returns: None @@ -407,33 +314,9 @@ def allgather(tensor_list: list, tensor, group_name: str = "default"): # Here we make it more strict: len(tensor_list) == world_size. raise RuntimeError( "The length of the tensor list operands to allgather " - "must be equal to world_size.") - opts = types.AllGatherOptions() - g.allgather([tensor_list], [tensor], opts) - - -def allgather_multigpu(output_tensor_lists: list, - input_tensor_list: list, - group_name: str = "default"): - """Allgather tensors from each gpus of the group into lists. - - Args: - output_tensor_lists (List[List[tensor]]): gathered results, with shape - must be num_gpus * world_size * shape(tensor). - input_tensor_list: (List[tensor]): a list of tensors, with shape - num_gpus * shape(tensor). - group_name (str): the name of the collective group. 
- - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_lists_input(output_tensor_lists) - _check_tensor_list_input(input_tensor_list) - g = _check_and_get_group(group_name) + "must not be equal to world_size.") opts = types.AllGatherOptions() - g.allgather(output_tensor_lists, input_tensor_list, opts) + g.allgather(tensor_list, tensor, opts) def reducescatter(tensor, @@ -463,38 +346,11 @@ def reducescatter(tensor, "must not be equal to world_size.") opts = types.ReduceScatterOptions() opts.reduceOp = op - g.reducescatter([tensor], [tensor_list], opts) - - -def reducescatter_multigpu(output_tensor_list, - input_tensor_lists, - group_name: str = "default", - op=types.ReduceOp.SUM): - """Reducescatter a list of tensors across all GPUs. - - Args: - output_tensor_list: the resulted list of tensors, with - shape: num_gpus * shape(tensor). - input_tensor_lists: the original tensors, with shape: - num_gpus * world_size * shape(tensor). - group_name (str): the name of the collective group. - op: The reduce operation. - - Returns: - None. - """ - if not types.cupy_available(): - raise RuntimeError("Multigpu calls requires NCCL and Cupy.") - _check_tensor_lists_input(input_tensor_lists) - _check_tensor_list_input(output_tensor_list) - g = _check_and_get_group(group_name) - opts = types.ReduceScatterOptions() - opts.reduceOp = op - g.reducescatter(output_tensor_list, input_tensor_lists, opts) + g.reducescatter(tensor, tensor_list, opts) def send(tensor, dst_rank: int, group_name: str = "default"): - """Send a tensor to a remote process synchronously. + """Send a tensor to a remote processes synchronously. Args: tensor: the tensor to send. 
@@ -510,41 +366,7 @@ def send(tensor, dst_rank: int, group_name: str = "default"): if dst_rank == g.rank: raise RuntimeError( "The destination rank '{}' is self.".format(dst_rank)) - opts = types.SendOptions() - opts.dst_rank = dst_rank - g.send([tensor], opts) - - -def send_multigpu(tensor, - dst_rank: int, - dst_gpu_index: int, - group_name: str = "default"): - """Send a tensor to a remote GPU synchronously. - - The function asssume each process owns >1 GPUs, and the sender - process and receiver process has equal nubmer of GPUs. - - Args: - tensor: the tensor to send, located on a GPU. - dst_rank (int): the rank of the destination process. - dst_gpu_index (int): the destination gpu index. - group_name (str): the name of the collective group. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("send_multigpu call requires NCCL.") - _check_single_tensor_input(tensor) - g = _check_and_get_group(group_name) - _check_rank_valid(g, dst_rank) - if dst_rank == g.rank: - raise RuntimeError("The dst_rank '{}' is self. Considering " - "doing GPU to GPU memcpy instead?".format(dst_rank)) - opts = types.SendOptions() - opts.dst_rank = dst_rank - opts.dst_gpu_index = dst_gpu_index - g.send([tensor], opts) + g.send(tensor, dst_rank) def recv(tensor, src_rank: int, group_name: str = "default"): @@ -564,41 +386,7 @@ def recv(tensor, src_rank: int, group_name: str = "default"): if src_rank == g.rank: raise RuntimeError( "The destination rank '{}' is self.".format(src_rank)) - opts = types.RecvOptions() - opts.src_rank = src_rank - g.recv([tensor], opts) - - -def recv_multigpu(tensor, - src_rank: int, - src_gpu_index: int, - group_name: str = "default"): - """Receive a tensor from a remote GPU synchronously. - - The function asssume each process owns >1 GPUs, and the sender - process and receiver process has equal nubmer of GPUs. - - Args: - tensor: the received tensor, located on a GPU. - src_rank (int): the rank of the source process. 
- src_gpu_index (int): the index of the source gpu on the src process. - group_name (str): the name of the collective group. - - Returns: - None - """ - if not types.cupy_available(): - raise RuntimeError("recv_multigpu call requires NCCL.") - _check_single_tensor_input(tensor) - g = _check_and_get_group(group_name) - _check_rank_valid(g, src_rank) - if src_rank == g.rank: - raise RuntimeError("The dst_rank '{}' is self. Considering " - "doing GPU to GPU memcpy instead?".format(src_rank)) - opts = types.RecvOptions() - opts.src_rank = src_rank - opts.src_gpu_index = src_gpu_index - g.recv([tensor], opts) + g.recv(tensor, src_rank) def _check_and_get_group(group_name): @@ -635,6 +423,16 @@ def _check_and_get_group(group_name): return g +def _check_backend_availability(backend: types.Backend): + """Check whether the backend is available.""" + if backend == types.Backend.MPI: + if not mpi_available(): + raise RuntimeError("MPI is not available.") + elif backend == types.Backend.NCCL: + if not nccl_available(): + raise RuntimeError("NCCL is not available.") + + def _check_single_tensor_input(tensor): """Check if the tensor is with a supported type.""" if isinstance(tensor, np.ndarray): @@ -650,16 +448,6 @@ def _check_single_tensor_input(tensor): type(tensor))) -def _check_backend_availability(backend: types.Backend): - """Check whether the backend is available.""" - if backend == types.Backend.GLOO: - if not gloo_available(): - raise RuntimeError("GLOO is not available.") - elif backend == types.Backend.NCCL: - if not nccl_available(): - raise RuntimeError("NCCL is not available.") - - def _check_inside_actor(): """Check if currently it is inside a Ray actor/task.""" worker = ray.worker.global_worker @@ -674,8 +462,8 @@ def _check_rank_valid(g, rank: int): """Check the rank: 0 <= rank < world_size.""" if rank < 0: raise ValueError("rank '{}' is negative.".format(rank)) - if rank >= g.world_size: - raise ValueError("rank '{}' must be less than world size " + if rank > 
g.world_size: + raise ValueError("rank '{}' is greater than world size " "'{}'".format(rank, g.world_size)) @@ -688,24 +476,3 @@ def _check_tensor_list_input(tensor_list): raise RuntimeError("Got an empty list of tensors.") for t in tensor_list: _check_single_tensor_input(t) - - -def _check_tensor_lists_input(tensor_lists): - """Check if the input is a list of lists of supported tensor types.""" - if not isinstance(tensor_lists, list): - raise RuntimeError("The input must be a list of lists of tensors. " - "Got '{}'.".format(type(tensor_lists))) - if not tensor_lists: - raise RuntimeError(f"Did not receive tensors. Got: {tensor_lists}") - for t in tensor_lists: - _check_tensor_list_input(t) - - -def _check_root_tensor_valid(length, root_tensor): - """Check the root_tensor device is 0 <= root_tensor < length""" - if root_tensor < 0: - raise ValueError("root_tensor '{}' is negative.".format(root_tensor)) - if root_tensor >= length: - raise ValueError( - "root_tensor '{}' is greater than the number of GPUs: " - "'{}'".format(root_tensor, length)) diff --git a/python/ray/util/collective/collective_group/nccl_collective_group.py b/python/ray/util/collective/collective_group/nccl_collective_group.py index 4cc693f11479..ba8c7d2dbb08 100644 --- a/python/ray/util/collective/collective_group/nccl_collective_group.py +++ b/python/ray/util/collective/collective_group/nccl_collective_group.py @@ -11,11 +11,15 @@ from ray.util.collective.const import get_nccl_store_name from ray.util.collective.types import AllReduceOptions, \ BarrierOptions, Backend, ReduceOptions, BroadcastOptions, \ - AllGatherOptions, ReduceScatterOptions, SendOptions, \ - RecvOptions + AllGatherOptions, ReduceScatterOptions logger = logging.getLogger(__name__) +# TODO(Hao): +# (1) stream management, instead of using the default stream, +# using a dedicate stream +# (2) communicator management and support num_gpus > 2 per actor. 
+ class Rendezvous: """A rendezvous class for different actor/task processes to meet. @@ -27,18 +31,13 @@ class Rendezvous: process. Args: - store_key (str): the unique store key, usually as a concatanation - of group_name and communicator key. See `get_nccl_communicator` - for more details. + group_name (str): the unique user-specified group name. """ - def __init__(self, store_key): - if not store_key: - raise ValueError( - "Invalid store_key. The store_key is a concatenation of " - "'group_name' and the 'communicator_key'. See the " - "docstring of `get_nccl_communicator` for details.") - self._store_key = store_key + def __init__(self, group_name): + if not group_name: + raise ValueError("Invalid group name.") + self._group_name = group_name self._store_name = None self._store = None @@ -54,7 +53,7 @@ def meet(self, timeout_s=180): if timeout_s <= 0: raise ValueError("The 'timeout' argument must be positive. " "Got '{}'.".format(timeout_s)) - self._store_name = get_nccl_store_name(self._store_key) + self._store_name = get_nccl_store_name(self._group_name) timeout_delta = datetime.timedelta(seconds=timeout_s) elapsed = datetime.timedelta(seconds=0) start_time = datetime.datetime.now() @@ -73,9 +72,7 @@ def meet(self, timeout_s=180): break if not self._store: raise RuntimeError("Unable to meet other processes " - "at the rendezvous store. If you are using " - "P2P communication, please check if tensors " - "are put in the correct GPU. ") + "at the rendezvous store.") @property def store(self): @@ -86,9 +83,8 @@ def get_nccl_id(self, timeout_s=180): Args: timeout_s: timeout in seconds. - Return: - uid (str): the NCCLUniqueID if successful. + str: the NCCLUniqueID if successful. """ if not self._store: raise ValueError("Rendezvous store is not setup.") @@ -114,52 +110,55 @@ def __init__(self, world_size, rank, group_name): """Init an NCCL collective group.""" super(NCCLGroup, self).__init__(world_size, rank, group_name) - # communicator and stream cache. 
- # TODO (Hao): we need a lock here... - self._dev_comm_map = {} - self._dev_streams_map = {} - - # record the used GPU IDs. - self._used_gpu_indices = set() + # TODO(Hao): change this to a be a cache + self._collective_comm_cache = None + self._p2p_comm_cache = {} if nccl_util.get_nccl_build_version() < 2000: raise RuntimeError("NCCL in Ray requires NCCL >= 2.0.") + # TODO(Hao): check version here if nccl_util.get_nccl_runtime_version() < 2704: logger.warning("NCCL send/recv calls requires NCCL>=2.7.4") + # Setup a tensor for barrier calls + self._barrier_tensor = cupy.array([1]) + def destroy_group(self): """Destroy the group and release NCCL communicators.""" - if len(self._dev_comm_map.keys()) > 0: - - # TODO(Hao): check this barrier call - # self.barrier() - - # Destroy the communicators and streams. - for comm_key, comms in self._dev_comm_map.items(): - for c in comms: - c.destroy() - self._dev_comm_map[comm_key] = None - - if self.rank == 0: - for comm_key in self._dev_comm_map: - assert not self._dev_comm_map[comm_key] - group_key = self._generate_group_key(comm_key) - self._destroy_store(group_key) - self._barrier_tensor = None - self._dev_comm_map = None - self._dev_streams_map = None + if self._collective_comm_cache: + self.barrier() + # We also need a barrier call here. 
+ stream = self._get_cuda_stream() + stream.synchronize() + # destroy the communicator + self._collective_comm_cache.destroy() + self._collective_comm_cache = None + + if self.rank == 0: + self._destroy_store(self.group_name) + + if self._p2p_comm_cache: + for key, comm in self._p2p_comm_cache.items(): + comm.destroy() + min_rank, max_rank = self._parse_p2p_group_key(key) + if self.rank == min_rank: + self._destroy_store(key) + self._p2p_comm_cache[key] = None + for key in list(self._p2p_comm_cache.keys()): + del self._p2p_comm_cache[key] + self._p2p_comm_cache = None + super(NCCLGroup, self).destroy_group() @classmethod def backend(cls): return Backend.NCCL - def allreduce(self, tensors, allreduce_options=AllReduceOptions()): - """AllReduce tensors across the collective group following options. + def allreduce(self, tensor, allreduce_options=AllReduceOptions()): + """AllReduce the tensor across the collective group following options. Args: - tensors (List): the list of tensors to be reduced. Each tensor must - reside on one GPU of the current process. + tensor: the tensor to be reduced, each tensor locates on a GPU. allreduce_options: allreduce options. Returns: @@ -175,41 +174,29 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp), stream.ptr) - self._collective(tensors, tensors, collective_fn) + self._collective(tensor, tensor, collective_fn) def barrier(self, barrier_options=BarrierOptions()): """Blocks until all processes reach this barrier. Args: - barrier_options: barrier options. + barrier_options: Returns: None """ - # Get the device list. 
- if self._used_gpu_indices: - devices = list(self._used_gpu_indices) - else: - devices = list(range(nccl_util.get_num_gpus())) - barrier_tensors = [None] * len(devices) - for i, d in enumerate(devices): - with nccl_util.Device(d): - barrier_tensors[i] = cupy.array([1]) - self.allreduce(barrier_tensors) - - def reduce(self, tensors, reduce_options=ReduceOptions()): - """Reduce tensors to a destination gpu following options. + self.allreduce(self._barrier_tensor) + + def reduce(self, tensor, reduce_options=ReduceOptions()): + """Reduce tensor to a destination process following options. Args: - tensors (List): the list of tensors to be reduced, each tensor - must reside on one gpu of the current process. - reduce_options: reduce options. + tensor: the tensor to be reduced. + reduce_options: reduce options Returns: None """ - root_rank = len(tensors) * reduce_options.root_rank \ - + reduce_options.root_tensor def collective_fn(input_tensor, output_tensor, comm, stream): comm.reduce( @@ -218,43 +205,40 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_tensor_n_elements(input_tensor), nccl_util.get_nccl_tensor_dtype(input_tensor), nccl_util.get_nccl_reduce_op(reduce_options.reduceOp), - root_rank, stream.ptr) + reduce_options.root_rank, stream.ptr) - self._collective(tensors, tensors, collective_fn) + self._collective(tensor, tensor, collective_fn) - def broadcast(self, tensors, broadcast_options=BroadcastOptions()): - """Broadcast tensors to all other gpus following options. + def broadcast(self, tensor, broadcast_options=BroadcastOptions()): + """Broadcast tensor to all other processes following options. Args: - tensors (List): tensors to be broadcast or received. + tensor: the tensor to be broadcasted. broadcast_options: broadcast options. 
Returns: None """ - root_rank = len(tensors) * broadcast_options.root_rank \ - + broadcast_options.root_tensor def collective_fn(input_tensor, output_tensor, comm, stream): comm.broadcast( nccl_util.get_tensor_ptr(input_tensor), nccl_util.get_tensor_ptr(output_tensor), nccl_util.get_tensor_n_elements(input_tensor), - nccl_util.get_nccl_tensor_dtype(input_tensor), root_rank, - stream.ptr) + nccl_util.get_nccl_tensor_dtype(input_tensor), + broadcast_options.root_rank, stream.ptr) - self._collective(tensors, tensors, collective_fn) + self._collective(tensor, tensor, collective_fn) def allgather(self, - tensor_lists, - tensors, + tensor_list, + tensor, allgather_options=AllGatherOptions()): - """Allgather tensors across gpus into a list of tensors. + """Allgather tensors across the group into a list of tensors. Args: - tensor_lists (List[List[Tensor]]): allgathered tensors. - tensors: the list of tensors to allgather across the group. - Each tensor must lolcate on a GPU of the process. + tensor_list: the tensor list to store the results. + tensor: the tensor to be allgather-ed across the group. allgather_options: allgather options. Returns: @@ -268,36 +252,30 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_tensor_n_elements(input_tensor), nccl_util.get_nccl_tensor_dtype(input_tensor), stream.ptr) - _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists) - output_flattened = [ - _flatten_for_scatter_gather(tensor_list, copy=False) - for tensor_list in tensor_lists - ] + _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list) + flattened_output_tensor = _flatten_for_scatter_gather( + tensor_list, copy=False) def postprocess_fn(stream): - # TODO(Hao): designate a copy stream. 
- for i, tensor_list in enumerate(tensor_lists): - for j, tensor in enumerate(tensor_list): - nccl_util.copy_tensor(tensor, output_flattened[i][j]) + for i, tensor in enumerate(tensor_list): + nccl_util.copy_tensor(tensor, flattened_output_tensor[i]) self._collective( - tensors, - output_flattened, + tensor, + flattened_output_tensor, collective_fn, postprocess_fn=postprocess_fn) def reducescatter(self, - tensors, - tensor_lists, + tensor, + tensor_list, reducescatter_options=ReduceScatterOptions()): - """Reduce the scatter a list of tensors across the group. + """Reducescatter a list of tensors across the group. Args: - tensors (List): the output tensors (could be unspecified), each - located on a GPU of the current process. - tensor_lists (List[List]): the list of tensors to be reduced then - scattered. - reducescatter_options: reduce-scatter options. + tensor: the output tensor (could be unspecified). + tensor_list: the list of tensor to be reduced then scattered. + reducescatter_options: reducescatter options. Returns: None @@ -312,30 +290,26 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp), stream.ptr) - _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists) - input_flattened = [ - _flatten_for_scatter_gather(tensor_list, copy=False) - for tensor_list in tensor_lists - ] + _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list) + flattened_input_tensor = _flatten_for_scatter_gather( + tensor_list, copy=False) def preprocess_fn(stream): - # TODO(Hao): designate a copy stream. 
- for i, tensor_list in enumerate(tensor_lists): - for j, tensor in enumerate(tensor_list): - nccl_util.copy_tensor(input_flattened[i][j], tensor) + for i, tensor in enumerate(tensor_list): + nccl_util.copy_tensor(flattened_input_tensor[i], tensor) self._collective( - input_flattened, - tensors, + flattened_input_tensor, + tensor, collective_fn, preprocess_fn=preprocess_fn) - def send(self, tensors, send_options=SendOptions()): - """Send a tensor to a destination gpu in the group. + def send(self, tensor, dst_rank): + """Send tensor to a destination process in the group. Args: - tensors (List): the tensor to send. - send_options: send options. + tensor: the tensor to send. + dst_rank: the rank of the destination process. Returns: None @@ -347,15 +321,14 @@ def p2p_fn(tensor, comm, stream, peer): nccl_util.get_tensor_n_elements(tensor), nccl_util.get_nccl_tensor_dtype(tensor), peer, stream.ptr) - self._point2point(tensors, p2p_fn, send_options.dst_rank, - send_options.dst_gpu_index) + self._point2point(tensor, p2p_fn, dst_rank) - def recv(self, tensors, recv_options=RecvOptions()): - """Receive a tensor from a source gpu in the group. + def recv(self, tensor, src_rank): + """Receive tensor from a source process in the group. Args: - tensors (List): the received tensor. - recv_options: Receive options. + tensor: the received tensor. + src_rank: the rank of the source process. Returns: None @@ -367,218 +340,128 @@ def p2p_fn(tensor, comm, stream, peer): nccl_util.get_tensor_n_elements(tensor), nccl_util.get_nccl_tensor_dtype(tensor), peer, stream.ptr) - self._point2point(tensors, p2p_fn, recv_options.src_rank, - recv_options.src_gpu_index) - - def _get_nccl_collective_communicator(self, comm_key, device_list): - """Create or retrieve an NCCL communicator from cache. - - If the communicator is found in cache, return the communicator. If not, - a communicator and a stream will be created and put in cache. - TODO(Hao): this function is not thread-safe now. 
+ self._point2point(tensor, p2p_fn, src_rank) - Args: - comm_key (str): the key to query the communicator cache. - device_list (List): a list of GPU devices of the current process - that participates into the collective. + def _get_nccl_collective_communicator(self): + """Create or retrieve a cached NCCL communicator. Returns: - communicator: the NCCL communicator corresponded to the devices. + communicator """ - if not comm_key: - raise RuntimeError("Got empty communicator key.") - for d in device_list: - self._used_gpu_indices.add(d) - - # TODO(Hao): lock the _dev_comm_map here. - if comm_key in self._dev_comm_map: - return self._dev_comm_map[comm_key] - - group_key = self._generate_group_key(comm_key) - if self.rank == 0: - nccl_uid = self._generate_nccl_uid(group_key) - else: - rendezvous = Rendezvous(group_key) - rendezvous.meet() - nccl_uid = rendezvous.get_nccl_id() - - # Now create the communicators - actual_world_size = len(device_list) * self.world_size - comms = [None] * len(device_list) - streams = [None] * len(device_list) - nccl_util.groupStart() - for i, device in enumerate(device_list): - actual_rank = self.rank * len(device_list) + i - with nccl_util.Device(device): - comms[i] = nccl_util.create_nccl_communicator( - actual_world_size, nccl_uid, actual_rank) - streams[i] = cupy.cuda.Stream.null - # Stream(non_blocking=True) - nccl_util.groupEnd() - self._dev_comm_map[comm_key] = comms - self._dev_streams_map[comm_key] = streams - return comms - - @staticmethod - def _sync_streams(): - """Let NCCL streams wait for current streams for every device.""" - # FIXME: This behavior is different from nccl document. It seems like - # cupy allocate tensors on null streams. 
- cupy.cuda.Stream.null.synchronize() - - def _get_nccl_p2p_communicator(self, comm_key, my_gpu_idx, peer_rank, - peer_gpu_idx): + if not self._collective_comm_cache: + # create the communicator + if self.rank == 0: + group_uid = self._generate_nccl_uid(self.group_name) + else: + rendezvous = Rendezvous(self.group_name) + rendezvous.meet() + group_uid = rendezvous.get_nccl_id() + self._collective_comm_cache = \ + nccl_util.create_nccl_communicator(self.world_size, + group_uid, + self.rank) + return self._collective_comm_cache + + def _get_nccl_p2p_communicator(self, rank1, rank2): """Create or retrieve an NCCL communicator for p2p tasks. - Note(Hao): this function is not thread-safe now. - Args: - comm_key (str): communicator key. - my_gpu_idx (int): the gpu index on the current process. - peer_rank (int): the rank of the destination process. - peer_gpu_idx (int): the gpu index on the peer process. + rank1 (int): source rank. + rank2 (int): destination rank. + Returns: communicator """ - if not comm_key: - raise RuntimeError("Got empty communicator key.") - - # TODO(Hao): lock the _dev_comm_map here. - if comm_key in self._dev_comm_map: - return self._dev_comm_map[comm_key] - - # Note (Hao): This is a bit complex so I decide to take a note here. - # Here we need to consider three cases: - # Case 1: src_rank != dst_rank, hence the send and recv happen on - # different process (actors/tasks); each process makes independent - # collective calls and manages corresponding communicators. - # Case 2: src_rank == dst_rank, src_gpu_idx == dst_gpu_idx; for - # this case, we simply throw a RuntimeError; - # Case 3: src_rank == dst_rank, src_gpu_idx != dst_gpu_idx, which - # means the send and recv will be called on the same process. We - # DO NOT support this case for now. We need to properly scope: - # (1) communicators creation, and - # (2) send/recv calls - # using groupStart(( and groupEnd() calls to avoid deadlocks. 
- if self.rank < peer_rank: - my_p2p_rank = 0 - elif self.rank > peer_rank: - my_p2p_rank = 1 - else: - raise RuntimeError( - "Send and recv happens on the same process! " - "ray.util.collective does not support this case as of now. " - "Alternatively, consider doing GPU to GPU memcpy?") - - group_key = self._generate_group_key(comm_key) - if my_p2p_rank == 0: - nccl_uid = self._generate_nccl_uid(group_key) - else: - rendezvous = Rendezvous(group_key) - rendezvous.meet() - nccl_uid = rendezvous.get_nccl_id() - - # create the p2p communicators - with nccl_util.Device(my_gpu_idx): - comm = nccl_util.create_nccl_communicator(2, nccl_uid, my_p2p_rank) - stream = cupy.cuda.Stream.null - # Stream(non_blocking=True) - self._dev_comm_map[comm_key] = [comm] - self._dev_streams_map[comm_key] = [stream] - return [comm] - - def _generate_group_key(self, comm_key): - """Generate a unique key used to initialize the KV store. - - The group key is a concatenation of the communicator key and - the group name, following: [comm_key]@[group_name]. - """ - return comm_key + "@" + self.group_name + min_rank = min(rank1, rank2) + max_rank = max(rank1, rank2) + my_rank = 0 if self.rank == min_rank else 1 + p2p_group_key = self._generate_p2p_group_key(min_rank, max_rank) + comm = self._p2p_comm_cache.get(p2p_group_key) + if not comm: + if self.rank == min_rank: + group_uid = self._generate_nccl_uid(p2p_group_key) + else: + rendezvous = Rendezvous(p2p_group_key) + rendezvous.meet() + group_uid = rendezvous.get_nccl_id() + comm = nccl_util.create_nccl_communicator(2, group_uid, my_rank) + self._p2p_comm_cache[p2p_group_key] = comm + return comm + + def _generate_p2p_group_key(self, min_rank, max_rank): + return self.group_name + "_" + str(min_rank) + "_" + str(max_rank) @staticmethod - def _destroy_store(group_key): - """Destroy the KV store (Ray named actor). - - Args: - group_key (str): the unique key to retrieve the KV store. 
+ def _parse_p2p_group_key(key): + strs = key.split("_") + return int(strs[-2]), int(strs[-1]) - Returns: - None - """ - store_name = get_nccl_store_name(group_key) + @staticmethod + def _destroy_store(group_name): + store_name = get_nccl_store_name(group_name) store = ray.get_actor(store_name) # ray.get([store.__ray_terminate__.remote()]) ray.kill(store) - def _generate_nccl_uid(self, key): - """Generate an NCCL unique ID for initializing communicators. - - The method will also create a KV store using Ray named actor and store - the NCCLUniqueID in the store. The store needs to be garbage collected - when destroying the collective group. + def _generate_nccl_uid(self, name): + """Generate an NCCL UID by calling the NCCL API. Args: - key (str): the key of the . + name: the name of the collective group. Returns: - NCCLUniqueID (str): NCCL unique ID. + str: NCCL uid. """ group_uid = nccl_util.get_nccl_unique_id() - store_name = get_nccl_store_name(key) + store_name = get_nccl_store_name(name) # Avoid a potential circular dependency in ray/actor.py from ray.util.collective.util import NCCLUniqueIDStore store = NCCLUniqueIDStore.options( name=store_name, lifetime="detached").remote(store_name) - ray.get([store.set_id.remote(group_uid)]) + ray.wait([store.set_id.remote(group_uid)]) return group_uid + @staticmethod + def _get_cuda_stream(): + """Obtain an idle stream from a stream pool for the collective task.""" + # TODO: implement a simple stream manager. + return cupy.cuda.Stream.null + def _collective(self, - input_tensors, - output_tensors, + input_tensor, + output_tensor, collective_fn, preprocess_fn=None, postprocess_fn=None): """A method to encapsulate all collective calls. Args: - input_tensors: the list of the input tensors. - output_tensors: the list of the output tensors. + input_tensor: the input tensor. + output_tensor: the output tensor. collective_fn: the collective function call. - preprocess_fn: preprocess procedures before collective calls. 
- postprocess_fn: postprocess procedures after collective calls. + preprocess_fn: preprocess function to call before collectives. + postprocess_fn: postprocess function to call after collectives. Returns: None """ - _check_gpu_tensors(input_tensors) - _check_gpu_tensors(output_tensors) - - devices = nccl_util.get_tensor_device_list(input_tensors) - key = _get_comm_key_from_devices(devices) - comms = self._get_nccl_collective_communicator(key, devices) - streams = self._dev_streams_map[key] - - # TODO(Hao): sync streams and events - self._sync_streams() + comm = self._get_nccl_collective_communicator() + stream = self._get_cuda_stream() # Make the collective call if preprocess_fn: - preprocess_fn(streams) - nccl_util.groupStart() - for i, tensor in enumerate(input_tensors): - collective_fn(tensor, output_tensors[i], comms[i], streams[i]) - nccl_util.groupEnd() + preprocess_fn(stream) + collective_fn(input_tensor, output_tensor, comm, stream) if postprocess_fn: - postprocess_fn(streams) + postprocess_fn(stream) - def _point2point(self, tensors, p2p_fn, peer_rank: int, peer_gpu_idx: int): - """A method to encapsulate all peer-to-peer calls (i.e., send/recv). + def _point2point(self, tensor, p2p_fn, peer_rank: int): + """A method to encapsulate all p2p calls. Args: - tensors: the tensor to send or receive. + tensor: the tensor to be sent/received. p2p_fn: the p2p function call. - peer_rank (int): the rank of the peer process. - peer_gpu_idx (int): the index of the gpu on the peer process. + peer_rank (int): the peer rank of the current process. Returns: None @@ -588,24 +471,13 @@ def _point2point(self, tensors, p2p_fn, peer_rank: int, peer_gpu_idx: int): raise RuntimeError("P2p send/recv requires NCCL >= 2.7.4. " "Got '{}'.".format( nccl_util.get_nccl_runtime_version())) - _check_gpu_tensors(tensors) - - # we currently only support single device to single device send/recv. 
- assert len(tensors) == 1 - my_gpu_idx = nccl_util.get_tensor_device(tensors[0]) - comm_key = _get_comm_key_send_recv(self.rank, my_gpu_idx, peer_rank, - peer_gpu_idx) - comms = self._get_nccl_p2p_communicator(comm_key, my_gpu_idx, - peer_rank, peer_gpu_idx) - streams = self._dev_streams_map[comm_key] - - # TODO(Hao): sync streams and events - self._sync_streams() # We have made sure that self.rank != peer_rank during API check. peer_p2p_rank = 0 if self.rank > peer_rank else 1 - for i, tensor in enumerate(tensors): - p2p_fn(tensors[i], comms[i], streams[i], peer_p2p_rank) + comm = self._get_nccl_p2p_communicator(self.rank, peer_rank) + stream = self._get_cuda_stream() + # Make the p2p call: + p2p_fn(tensor, comm, stream, peer_p2p_rank) def _flatten_for_scatter_gather(tensor_list, copy=False): @@ -624,130 +496,29 @@ def _flatten_for_scatter_gather(tensor_list, copy=False): # note we need a cupy dtype here. dtype = nccl_util.get_cupy_tensor_dtype(t) buffer_shape = [len(tensor_list)] + nccl_util.get_tensor_shape(t) - device = nccl_util.get_tensor_device(t) - with nccl_util.Device(device): - buffer = cupy.empty(buffer_shape, dtype=dtype) + buffer = cupy.empty(buffer_shape, dtype=dtype) if copy: for i, tensor in enumerate(tensor_list): nccl_util.copy_tensor(buffer[i], tensor) return buffer -def _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists): - """Check the compatibility between tensor input and tensor list input.""" - if not tensors or not isinstance(tensors, list): - raise RuntimeError( - "The first argument 'tensors' expects a list of tensors.") - if not tensor_lists or not isinstance(tensor_lists, list): - raise RuntimeError("The second argument 'tensor_lists' " - "expects a list of tensor list.") - dtype = nccl_util.get_nccl_tensor_dtype(tensors[0]) - shape = nccl_util.get_tensor_shape(tensors[0]) - for i, tensor_list in enumerate(tensor_lists): - # check all tensor in `tensors` match. 
- dt = nccl_util.get_nccl_tensor_dtype(tensors[i]) +def _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list): + """Check the compatibility between tensor input and tensor list inputs.""" + if not tensor_list: + raise RuntimeError("Got empty list of tensors.") + dtype = nccl_util.get_nccl_tensor_dtype(tensor) + shape = nccl_util.get_tensor_shape(tensor) + for t in tensor_list: + # check dtype + dt = nccl_util.get_nccl_tensor_dtype(t) if dt != dtype: raise RuntimeError("All tensor operands to scatter/gather must " - "have the same dtype. Got '{}' and '{}'." - .format(dt, dtype)) + "have the same dtype. Got '{}' and '{}'" + "".format(dt, dtype)) # Note: typically CCL libraries only requires they have the same - # number of elements; Here we make it more strict -- we require - # exact shape match. - s = nccl_util.get_tensor_shape(tensors[i]) - if s != shape: + # number of elements; + # Here we make it more strict -- we require exact shape match. + if nccl_util.get_tensor_shape(t) != shape: raise RuntimeError("All tensor operands to scatter/gather must " - "have the same shape. Got '{}' and '{}'." - .format(s, shape)) - # check all tensors in `tensor_lists` match. - for t in tensor_lists[i]: - # check dtype - dt = nccl_util.get_nccl_tensor_dtype(t) - if dt != dtype: - raise RuntimeError( - "All tensor operands to scatter/gather must " - "have the same dtype. Got '{}' and '{}'.".format( - dt, dtype)) - s = nccl_util.get_tensor_shape(t) - if s != shape: - raise RuntimeError( - "All tensor operands to scatter/gather must " - "have the same shape. Got '{}' and '{}'.".format(s, shape)) - - -def _check_gpu_tensors(tensors): - """Check all tensors are distributed on different GPUs.""" - if not tensors or not isinstance(tensors, list): - raise RuntimeError("'tensors' must be a nonempty list.") - if len(tensors) > nccl_util.get_num_gpus(): - raise RuntimeError("Tensor list cannot be larger than the number" - "of available GPUs. 
Got {} > {}.".format( - len(tensors), nccl_util.get_num_gpus())) - t0 = tensors[0] - dt = nccl_util.get_nccl_tensor_dtype(t0) - s = nccl_util.get_tensor_shape(t0) - d = nccl_util.get_tensor_device(t0) - for i, t in enumerate(tensors): - if i == 0: - continue - # We need to check the following: - # (1) tensor is cuda (already checked during API) - # (2) tensor dtype - # (3) tensor shape match - # (4) each tensor is on a different GPU - dtype = nccl_util.get_nccl_tensor_dtype(t) - if dt != dtype: - raise RuntimeError("Tensors must have identical dtype. Got: '{}'." - .format(dtype)) - shape = nccl_util.get_tensor_shape(t) - if s != shape: - raise RuntimeError("Tensor must have identical shape. Got: '{}'." - .format(shape)) - device = nccl_util.get_tensor_device(t) - if device == d: - raise RuntimeError("Tensor must be on distinct GPUs.") - - -def _get_comm_key_from_devices(devices): - """Return a key from a list of devices for collective calls. - - For example, if the tensors are on gpus 0, 1, 2, 3, - then the key would be "0,1,2,3". - - Args: - devices(list): a list of GPU device indices - - Returns: - str: a string represents the key to query the communicator cache. - - """ - return ",".join([str(d) for d in devices]) - - -def _get_comm_key_send_recv(my_rank, my_gpu_idx, peer_rank, peer_gpu_idx): - """Return a key given source and destination ranks for p2p tasks. - - The p2p key is in the following form: - [min_rank]_[gpu_index]:[max_rank]_[gpu_index]. - - Args: - my_rank (int): the rank of the source process. - my_gpu_idx (int): the source gpu index on the process. - peer_rank (int): the rank of the destination process. - peer_gpu_idx (int): the destination gpu index on the process. - - Returns: - comm_key (str): a string key to query the communication cache. 
- """ - if my_rank < peer_rank: - lower_key = str(my_rank) + "_" + str(my_gpu_idx) - higher_key = str(peer_rank) + "_" + str(peer_gpu_idx) - elif my_rank > peer_rank: - lower_key = str(peer_rank) + "_" + str(peer_gpu_idx) - higher_key = str(my_rank) + "_" + str(my_gpu_idx) - else: - raise RuntimeError( - "Send and recv happens on the same process. ray.util.collective " - "does not support this case as of now. Alternatively, consider " - "doing GPU to GPU memcpy?") - comm_key = lower_key + ":" + higher_key - return comm_key + "have the same shape.") diff --git a/python/ray/util/collective/collective_group/nccl_util.py b/python/ray/util/collective/collective_group/nccl_util.py index 36895d79b884..889c8c443f36 100644 --- a/python/ray/util/collective/collective_group/nccl_util.py +++ b/python/ray/util/collective/collective_group/nccl_util.py @@ -3,12 +3,9 @@ try: import cupy from cupy.cuda import nccl - from cupy.cuda import Device # noqa: F401 from cupy.cuda.nccl import get_version from cupy.cuda.nccl import get_build_version from cupy.cuda.nccl import NcclCommunicator - from cupy.cuda.nccl import groupStart # noqa: F401 - from cupy.cuda.nccl import groupEnd # noqa: F401 except ImportError: raise ImportError("NCCL in Ray requires Cupy being available!") @@ -77,11 +74,6 @@ } -def get_num_gpus(): - """Returns the number of compute-capable GPUs.""" - return cupy.cuda.runtime.getDeviceCount() - - def get_nccl_build_version(): return get_build_version() @@ -98,12 +90,14 @@ def create_nccl_communicator(world_size, nccl_unique_id, rank): """Create an NCCL communicator using NCCL APIs. Args: - world_size (int): the number of processes of this communicator group. + world_size (int): the number of processes of this communcator group. nccl_unique_id (str): the NCCLUniqueID for this group. rank (int): the rank of this process. Returns: comm (nccl.ncclComm_t): an NCCL communicator. """ + # TODO(Hao): make this inside the NCCLComm class, + # and implement the abort method. 
Make it RAII. comm = NcclCommunicator(world_size, nccl_unique_id, rank) return comm @@ -155,7 +149,7 @@ def get_tensor_ptr(tensor): if torch_available(): if isinstance(tensor, torch.Tensor): if not tensor.is_cuda: - raise RuntimeError("Torch tensor must be on GPU.") + raise RuntimeError("torch tensor must be on gpu.") return tensor.data_ptr() raise ValueError("Unsupported tensor type. Got: {}. Supported " "GPU tensor types are: torch.Tensor, " @@ -200,24 +194,6 @@ def get_tensor_strides(tensor): "cupy.ndarray.".format(type(tensor))) -def get_tensor_device(tensor): - """Return the GPU index of a tensor.""" - if isinstance(tensor, cupy.ndarray): - try: - device = tensor.device.id - except AttributeError as exec: - raise RuntimeError("The tensor is not on a valid GPU.") \ - from exec - elif torch_available() and isinstance(tensor, torch.Tensor): - device = tensor.device.index - if not isinstance(device, int): - raise RuntimeError("The tensor is not on a valid GPU.") - else: - raise ValueError("Unsupported tensor type. " - "Got: {}.".format(type(tensor))) - return device - - def copy_tensor(dst_tensor, src_tensor): """Copy the content from src_tensor to dst_tensor. @@ -252,21 +228,3 @@ def copy_tensor(dst_tensor, src_tensor): raise ValueError("Unsupported tensor type. Got: {} and {}. Supported " "GPU tensor types are: torch.Tensor, cupy.ndarray." .format(type(dst_tensor), type(src_tensor))) - - -def get_tensor_device_list(tensors): - """Returns the gpu devices of the list of input tensors. - - Args: - tensors(list): a list of tensors, each locates on a GPU. - - Returns: - list: the list of GPU devices. - - """ - if not isinstance(tensors, list): - raise RuntimeError( - "Expect a list of tensors each locates on a GPU device. 
" - "Got: '{}'.".format(type(tensors))) - devices = [get_tensor_device(t) for t in tensors] - return devices diff --git a/python/ray/util/collective/examples/nccl_allreduce_example.py b/python/ray/util/collective/examples/nccl_allreduce_example.py index 797924621a52..7010d69249f2 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example.py @@ -11,11 +11,12 @@ def __init__(self): self.recv = cp.zeros((4, ), dtype=cp.float32) def setup(self, world_size, rank): - collective.init_collective_group(world_size, rank, "nccl", "default") + collective.init_collective_group("nccl", world_size, rank, "default") return True def compute(self): collective.allreduce(self.send, "default") + print(self.send) return self.send def destroy(self): @@ -23,8 +24,11 @@ def destroy(self): if __name__ == "__main__": + send = cp.ones((4, ), dtype=cp.float32) + ray.init(num_gpus=2) + num_workers = 2 workers = [] init_rets = [] @@ -34,4 +38,5 @@ def destroy(self): init_rets.append(w.setup.remote(num_workers, i)) _ = ray.get(init_rets) results = ray.get([w.compute.remote() for w in workers]) + # print(results) ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py index 106ea31b2b7f..9d0335dbab11 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py @@ -30,4 +30,5 @@ def compute(self): } collective.declare_collective_group(workers, **_options) results = ray.get([w.compute.remote() for w in workers]) + print(results) ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py b/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py deleted file mode 100644 index 
88b75802e880..000000000000 --- a/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py +++ /dev/null @@ -1,43 +0,0 @@ -import ray -import cupy as cp - -import ray.util.collective as collective -from cupy.cuda import Device - - -@ray.remote(num_gpus=2) -class Worker: - def __init__(self): - with Device(0): - self.send1 = cp.ones((4, ), dtype=cp.float32) - with Device(1): - self.send2 = cp.ones((4, ), dtype=cp.float32) * 2 - - self.recv = cp.zeros((4, ), dtype=cp.float32) - - def setup(self, world_size, rank): - collective.init_collective_group(world_size, rank, "nccl", "177") - return True - - def compute(self): - collective.allreduce_multigpu([self.send1, self.send2], "177") - return [self.send1, self.send2], self.send1.device, self.send2.device - - def destroy(self): - collective.destroy_collective_group("177") - - -if __name__ == "__main__": - ray.init(address="auto") - num_workers = 2 - workers = [] - init_rets = [] - for i in range(num_workers): - w = Worker.remote() - workers.append(w) - init_rets.append(w.setup.remote(num_workers, i)) - a = ray.get(init_rets) - results = ray.get([w.compute.remote() for w in workers]) - print(results) - ray.get([w.destroy.remote() for w in workers]) - ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py b/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py deleted file mode 100644 index 7ff637a5bd68..000000000000 --- a/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py +++ /dev/null @@ -1,53 +0,0 @@ -import ray -import cupy as cp - -import ray.util.collective as collective -from cupy.cuda import Device - - -@ray.remote(num_gpus=2) -class Worker: - def __init__(self): - with Device(0): - self.send1 = cp.ones((4, ), dtype=cp.float32) - with Device(1): - self.send2 = cp.ones((4, ), dtype=cp.float32) * 2 - - with Device(0): - self.recv1 = cp.zeros((4, ), dtype=cp.float32) - with Device(1): - self.recv2 = cp.zeros((4, ), dtype=cp.float32) - 
self.rank = -1 - - def setup(self, world_size, rank): - self.rank = rank - collective.init_collective_group(world_size, rank, "nccl", "8") - return True - - def compute(self): - if self.rank == 0: - with Device(0): - collective.send_multigpu(self.send1 * 2, 1, 1, "8") - else: - # with Device(1): - collective.recv_multigpu(self.recv2, 0, 0, "8") - return self.recv2 - - def destroy(self): - collective.destroy_collective_group("8") - - -if __name__ == "__main__": - ray.init(address="auto") - num_workers = 2 - workers = [] - init_rets = [] - for i in range(num_workers): - w = Worker.remote() - workers.append(w) - init_rets.append(w.setup.remote(num_workers, i)) - a = ray.get(init_rets) - results = ray.get([w.compute.remote() for w in workers]) - print(results) - ray.get([w.destroy.remote() for w in workers]) - ray.shutdown() diff --git a/python/ray/util/collective/tests/conftest.py b/python/ray/util/collective/tests/conftest.py index 341142ec050d..ab5b3765d166 100644 --- a/python/ray/util/collective/tests/conftest.py +++ b/python/ray/util/collective/tests/conftest.py @@ -1,41 +1,30 @@ """Some fixtures for collective tests.""" -import logging - import pytest + import ray -from ray.util.collective.collective_group.nccl_collective_group \ - import _get_comm_key_from_devices, _get_comm_key_send_recv from ray.util.collective.const import get_nccl_store_name -logger = logging.getLogger(__name__) -logger.setLevel("INFO") - # TODO (Hao): remove this clean_up function as it sometimes crashes Ray. 
def clean_up(): group_names = ["default", "test", "123?34!", "default2", "random"] group_names.extend([str(i) for i in range(10)]) max_world_size = 4 - all_keys = [] + p2p_group_names = [] for name in group_names: - devices = [[0], [0, 1], [1, 0]] - for d in devices: - collective_communicator_key = _get_comm_key_from_devices(d) - all_keys.append(collective_communicator_key + "@" + name) for i in range(max_world_size): for j in range(max_world_size): - if i < j: - p2p_communicator_key = _get_comm_key_send_recv(i, 0, j, 0) - all_keys.append(p2p_communicator_key + "@" + name) - for group_key in all_keys: - store_name = get_nccl_store_name(group_key) + if i <= j: + p2p_group_name = name + "_" + str(i) + "_" + str(j) + p2p_group_names.append(p2p_group_name) + all_names = group_names + p2p_group_names + for group_name in all_names: + store_name = get_nccl_store_name(group_name) try: actor = ray.get_actor(store_name) except ValueError: actor = None if actor: - logger.debug("Killing actor with group_key: '{}' and store: '{}'." - .format(group_key, store_name)) ray.kill(actor) @@ -52,18 +41,6 @@ def ray_start_single_node_2_gpus(): # my own on-premise cluster before run this fixture. @pytest.fixture def ray_start_distributed_2_nodes_4_gpus(): - # The cluster has a setup of 2 nodes, each node with 2 - # GPUs. Each actor will be allocated 1 GPU. - ray.init("auto") - yield - clean_up() - ray.shutdown() - - -@pytest.fixture -def ray_start_distributed_multigpu_2_nodes_4_gpus(): - # The cluster has a setup of 2 nodes, each node with 2 - # GPUs. Each actor will be allocated 2 GPUs. 
ray.init("auto") yield clean_up() diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py b/python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py deleted file mode 100644 index c4cabcd45524..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Test the allgather API on a distributed Ray cluster.""" -import pytest -import ray - -import cupy as cp -import torch - -from ray.util.collective.tests.util import \ - create_collective_multigpu_workers, \ - init_tensors_for_gather_scatter_multigpu - - -@pytest.mark.parametrize("tensor_backend", ["cupy", "torch"]) -@pytest.mark.parametrize("array_size", - [2, 2**5, 2**10, 2**15, 2**20, [2, 2], [5, 5, 5]]) -def test_allgather_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, - tensor_backend): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - init_tensors_for_gather_scatter_multigpu( - actors, array_size=array_size, tensor_backend=tensor_backend) - results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - for k in range(actual_world_size): - if tensor_backend == "cupy": - assert (results[i][j][k] == cp.ones( - array_size, dtype=cp.float32)).all() - else: - assert (results[i][j][k] == torch.ones( - array_size, dtype=torch.float32).cuda(j)).all() - - -def test_allgather_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = 
world_size * num_gpu_per_worker - shape = [10, 10] - actors, _ = create_collective_multigpu_workers(world_size) - - # tensor is pytorch, list is cupy - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - for k in range(actual_world_size): - assert (results[i][j][k] == cp.ones(shape, - dtype=cp.float32)).all() - - # tensor is cupy, list is pytorch - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - for k in range(actual_world_size): - assert (results[i][j][k] == torch.ones( - shape, dtype=torch.float32).cuda(j)).all() - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py deleted file mode 100644 index b681a08490b0..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Test the collective allreduice API on a distributed Ray cluster.""" -import pytest -import logging - -import cupy as cp - -import ray -from ray.util.collective.types import ReduceOp -from ray.util.collective.tests.util import create_collective_multigpu_workers - -logger = logging.getLogger(__name__) -logger.setLevel("DEBUG") - - 
-@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -def test_allreduce_multigpu_different_name( - ray_start_distributed_multigpu_2_nodes_4_gpus, group_name): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers( - num_workers=world_size, group_name=group_name) - results = ray.get( - [a.do_allreduce_multigpu.remote(group_name) for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - assert (results[1] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - - -@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) -def test_allreduce_multigpu_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - ray.get([a.set_buffer.remote(array_size) for a in actors]) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones( - (array_size, ), dtype=cp.float32) * actual_world_size).all() - assert (results[1] == cp.ones( - (array_size, ), dtype=cp.float32) * actual_world_size).all() - - -def test_allreduce_multigpu_destroy( - ray_start_distributed_multigpu_2_nodes_4_gpus, - backend="nccl", - group_name="default"): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - assert (results[1] == cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - - # destroy the group and try do work, should fail - ray.get([a.destroy_group.remote() for a in actors]) - with pytest.raises(RuntimeError): - results 
= ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - - # reinit the same group and all reduce - ray.get([ - actor.init_group.remote(world_size, i, backend, group_name) - for i, actor in enumerate(actors) - ]) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * actual_world_size - * actual_world_size).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * actual_world_size - * actual_world_size).all() - - -def test_allreduce_multigpu_multiple_group( - ray_start_distributed_multigpu_2_nodes_4_gpus, - backend="nccl", - num_groups=5): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - for group_name in range(1, num_groups): - ray.get([ - actor.init_group.remote(world_size, i, backend, str(group_name)) - for i, actor in enumerate(actors) - ]) - for i in range(num_groups): - group_name = "default" if i == 0 else str(i) - results = ray.get( - [a.do_allreduce_multigpu.remote(group_name) for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=cp.float32) * (actual_world_size**(i + 1))).all() - - -def test_allreduce_multigpu_different_op( - ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - # check product - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get( - [a.do_allreduce_multigpu.remote(op=ReduceOp.PRODUCT) for a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 120).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 120).all() - - # check min - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get( - [a.do_allreduce_multigpu.remote(op=ReduceOp.MIN) for 
a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 2).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 2).all() - - # check max - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get( - [a.do_allreduce_multigpu.remote(op=ReduceOp.MAX) for a in actors]) - assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 5).all() - assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 5).all() - - -@pytest.mark.parametrize("dtype", - [cp.uint8, cp.float16, cp.float32, cp.float64]) -def test_allreduce_multigpu_different_dtype( - ray_start_distributed_multigpu_2_nodes_4_gpus, dtype): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - ray.get([a.set_buffer.remote([10], dtype=dtype) for a in actors]) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones( - (10, ), dtype=dtype) * actual_world_size).all() - assert (results[1] == cp.ones( - (10, ), dtype=dtype) * actual_world_size).all() - - -def test_allreduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus): - # import torch - world_size = 2 - actual_world_size = 4 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([10])) - ray.get(actors[1].set_buffer.remote( - [10], tensor_type0="torch", tensor_type1="torch")) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones((10, )) * actual_world_size).all() - - ray.get(actors[0].set_buffer.remote( - [10], tensor_type0="cupy", tensor_type1="torch")) - ray.get(actors[1].set_buffer.remote( - [10], tensor_type0="torch", tensor_type1="cupy")) - results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) - assert (results[0] == cp.ones((10, )) * actual_world_size).all() diff --git 
a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py deleted file mode 100644 index 40be55dd2e0b..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Test the collective group APIs.""" -import pytest -import ray -from random import shuffle - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -def test_init_two_actors(ray_start_distributed_multigpu_2_nodes_4_gpus, - group_name): - world_size = 2 - actors, results = create_collective_multigpu_workers( - world_size, group_name) - for i in range(world_size): - assert (results[i]) - - -def test_report_num_gpus(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, results = create_collective_multigpu_workers(world_size) - num_gpus = ray.get([actor.report_num_gpus.remote() for actor in actors]) - assert num_gpus == [2, 2] - - -def test_get_rank(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - actor0_rank = ray.get(actors[0].report_rank.remote()) - assert actor0_rank == 0 - actor1_rank = ray.get(actors[1].report_rank.remote()) - assert actor1_rank == 1 - - # create a second group with a different name, and different - # orders of ranks. 
- new_group_name = "default2" - ranks = list(range(world_size)) - shuffle(ranks) - _ = ray.get([ - actor.init_group.remote( - world_size, ranks[i], group_name=new_group_name) - for i, actor in enumerate(actors) - ]) - actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name)) - assert actor0_rank == ranks[0] - actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name)) - assert actor1_rank == ranks[1] - - -def test_availability(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - actor0_nccl_availability = ray.get( - actors[0].report_nccl_availability.remote()) - assert actor0_nccl_availability - actor0_gloo_availability = ray.get( - actors[0].report_gloo_availability.remote()) - assert not actor0_gloo_availability - - -def test_is_group_initialized(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - # check group is_init - actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor0_is_init - actor0_is_init = ray.get( - actors[0].report_is_group_initialized.remote("random")) - assert not actor0_is_init - actor0_is_init = ray.get( - actors[0].report_is_group_initialized.remote("123")) - assert not actor0_is_init - actor1_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor1_is_init - actor1_is_init = ray.get( - actors[0].report_is_group_initialized.remote("456")) - assert not actor1_is_init - - -def test_destroy_group(ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - # Now destroy the group at actor0 - ray.wait([actors[0].destroy_group.remote()]) - actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert not actor0_is_init - - # should go well as the group `random` does not exist at all - ray.wait([actors[0].destroy_group.remote("random")]) - - 
actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) - assert actor1_is_init - ray.wait([actors[1].destroy_group.remote("random")]) - actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) - assert actor1_is_init - ray.wait([actors[1].destroy_group.remote("default")]) - actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) - assert not actor1_is_init - - # Now reconstruct the group using the same name - init_results = ray.get([ - actor.init_group.remote(world_size, i) - for i, actor in enumerate(actors) - ]) - for i in range(world_size): - assert init_results[i] - actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor0_is_init - actor1_is_init = ray.get(actors[0].report_is_group_initialized.remote()) - assert actor1_is_init - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py deleted file mode 100644 index 5ded5bce35e8..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test the broadcast API.""" -import pytest -import cupy as cp -import ray - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -def test_broadcast_different_name( - ray_start_distributed_multigpu_2_nodes_4_gpus, group_name, src_rank, - src_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers( - num_workers=world_size, group_name=group_name) - ray.get(actors[0].set_buffer.remote([10], value0=2, 
value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - - results = ray.get([ - a.do_broadcast_multigpu.remote( - group_name=group_name, - src_rank=src_rank, - src_gpu_index=src_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = (src_rank + 1) * 2 + src_gpu_index - assert ( - results[i][j] == cp.ones([10], dtype=cp.float32) * val).all() - - -@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -def test_broadcast_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, src_rank, - src_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([array_size], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([array_size], value0=4, value1=5)) - results = ray.get([ - a.do_broadcast_multigpu.remote( - src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = (src_rank + 1) * 2 + src_gpu_index - assert (results[i][j] == cp.ones( - (array_size, ), dtype=cp.float32) * val).all() - - -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -def test_broadcast_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus, - src_rank, src_gpu_index): - import torch - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote( - [10], value0=4, value1=5, tensor_type0="torch", tensor_type1="torch")) - results = ray.get([ - a.do_broadcast_multigpu.remote( - src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = 
(src_rank + 1) * 2 + src_gpu_index - if i == 0: - assert (results[i][j] == cp.ones([10], dtype=cp.float32) * - val).all() - else: - assert (results[i][j] == torch.ones([10]).cuda(j) * val).all() - - -@pytest.mark.parametrize("src_rank", [3, 4]) -@pytest.mark.parametrize("src_gpu_index", [2, 3]) -def test_broadcast_invalid_rank(ray_start_distributed_multigpu_2_nodes_4_gpus, - src_rank, src_gpu_index): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - with pytest.raises(ValueError): - _ = ray.get([ - a.do_broadcast_multigpu.remote( - src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors - ]) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py deleted file mode 100644 index 8ac5d54c1c12..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Test the reduce API.""" -import pytest -import cupy as cp -import ray -from ray.util.collective.types import ReduceOp - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_different_name(ray_start_distributed_multigpu_2_nodes_4_gpus, - group_name, dst_rank, dst_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers( - num_workers=world_size, group_name=group_name) - results = ray.get([ - a.do_reduce_multigpu.remote( - group_name, dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) - for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] 
== cp.ones( - (10, ), dtype=cp.float32) * actual_world_size).all() - else: - assert (results[i][j] == cp.ones((10, ), - dtype=cp.float32)).all() - - -@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, dst_rank, - dst_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(num_workers=world_size) - - ray.get(actors[0].set_buffer.remote(array_size)) - ray.get(actors[1].set_buffer.remote(array_size)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (array_size, ), dtype=cp.float32) * - actual_world_size).all() - else: - assert (results[i][j] == cp.ones( - (array_size, ), dtype=cp.float32)).all() - - -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_different_op(ray_start_distributed_multigpu_2_nodes_4_gpus, - dst_rank, dst_gpu_index): - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - - # check product - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, - dst_gpu_index=dst_gpu_index, - op=ReduceOp.PRODUCT) for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * 120).all() - else: - val = (i + 1) * 2 + j - assert (results[i][j] == 
cp.ones( - (10, ), dtype=cp.float32) * val).all() - - # check min - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MIN) - for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * 2).all() - else: - val = (i + 1) * 2 + j - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * val).all() - - # check max - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MAX) - for a in actors - ]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if i == dst_rank and j == dst_gpu_index: - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * 5).all() - else: - val = (i + 1) * 2 + j - assert (results[i][j] == cp.ones( - (10, ), dtype=cp.float32) * val).all() - - -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -def test_reduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus, - dst_rank, dst_gpu_index): - import torch - world_size = 2 - num_gpu_per_worker = 2 - actors, _ = create_collective_multigpu_workers(world_size) - ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote( - [10], value0=4, value1=5, tensor_type0="torch", tensor_type1="torch")) - - results = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors - ]) - - for i in range(world_size): - for j in range(num_gpu_per_worker): - val = (i + 1) * 2 + j - if dst_rank == i and dst_gpu_index == j: - if i == 0: - 
assert (results[i][j] == cp.ones([10], dtype=cp.float32) * - 14).all() - else: - assert ( - results[i][j] == torch.ones([10]).cuda(j) * 14).all() - else: - if i == 0: - assert (results[i][j] == cp.ones([10], dtype=cp.float32) * - val).all() - else: - assert ( - results[i][j] == torch.ones([10]).cuda(j) * val).all() - - -@pytest.mark.parametrize("dst_rank", [3, 4]) -@pytest.mark.parametrize("dst_gpu_index", [2, 3]) -def test_reduce_invalid_rank(ray_start_distributed_multigpu_2_nodes_4_gpus, - dst_rank, dst_gpu_index): - world_size = 2 - actors, _ = create_collective_multigpu_workers(world_size) - with pytest.raises(ValueError): - _ = ray.get([ - a.do_reduce_multigpu.remote( - dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors - ]) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py deleted file mode 100644 index 48f72389bf89..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Test the collective reducescatter API on a distributed Ray cluster.""" -import pytest -import ray - -import cupy as cp -import torch - -from ray.util.collective.tests.util import \ - create_collective_multigpu_workers, \ - init_tensors_for_gather_scatter_multigpu - - -@pytest.mark.parametrize("tensor_backend", ["cupy", "torch"]) -@pytest.mark.parametrize("array_size", - [2, 2**5, 2**10, 2**15, 2**20, [2, 2], [5, 5, 5]]) -def test_reducescatter_different_array_size( - ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, - tensor_backend): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - actors, _ = create_collective_multigpu_workers(world_size) - - init_tensors_for_gather_scatter_multigpu( - actors, array_size=array_size, tensor_backend=tensor_backend) - 
results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - if tensor_backend == "cupy": - assert (results[i][j] == cp.ones(array_size, dtype=cp.float32) - * actual_world_size).all() - else: - assert (results[i][j] == torch.ones( - array_size, dtype=torch.float32).cuda(j) * - actual_world_size).all() - - -def test_reducescatter_torch_cupy( - ray_start_distributed_multigpu_2_nodes_4_gpus): - world_size = 2 - num_gpu_per_worker = 2 - actual_world_size = world_size * num_gpu_per_worker - shape = [10, 10] - actors, _ = create_collective_multigpu_workers(world_size) - - # tensor is pytorch, list is cupy - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - assert (results[i][j] == torch.ones( - shape, dtype=torch.float32).cuda(j) * actual_world_size).all() - - # tensor is cupy, list is pytorch - for i, a in enumerate(actors): - ray.get([ - a.set_buffer.remote( - shape, tensor_type0="cupy", tensor_type1="cupy") - ]) - ray.get([ - a.set_list_buffer.remote( - shape, tensor_type0="torch", tensor_type1="torch") - ]) - results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) - for i in range(world_size): - for j in range(num_gpu_per_worker): - assert (results[i][j] == cp.ones(shape, dtype=cp.float32) * - actual_world_size).all() - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py deleted file mode 100644 
index a88fdb34ec8f..000000000000 --- a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Test the send/recv API.""" -import cupy as cp -import pytest -import ray - -from ray.util.collective.tests.util import create_collective_multigpu_workers - - -# @pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) -@pytest.mark.parametrize("dst_rank", [0, 1]) -@pytest.mark.parametrize("src_rank", [0, 1]) -@pytest.mark.parametrize("dst_gpu_index", [0, 1]) -@pytest.mark.parametrize("src_gpu_index", [0, 1]) -@pytest.mark.parametrize("array_size", - [2**10, 2**15, 2**20, [2, 2], [5, 9, 10, 85]]) -def test_sendrecv(ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, - src_rank, dst_rank, src_gpu_index, dst_gpu_index): - if src_rank == dst_rank: - return - world_size = 2 - actors, _ = create_collective_multigpu_workers(num_workers=world_size) - - ray.get(actors[0].set_buffer.remote(array_size, value0=2, value1=3)) - ray.get(actors[1].set_buffer.remote(array_size, value0=4, value1=5)) - - refs = [] - for i in range(world_size): - refs.append(actors[i].get_buffer.remote()) - refs[src_rank][src_gpu_index] = actors[src_rank].do_send_multigpu.remote( - dst_rank=dst_rank, - dst_gpu_index=dst_gpu_index, - src_gpu_index=src_gpu_index) - refs[dst_rank][dst_gpu_index] = actors[dst_rank].do_recv_multigpu.remote( - src_rank=src_rank, - src_gpu_index=src_gpu_index, - dst_gpu_index=dst_gpu_index) - results = [] - results_flattend = ray.get(refs[0] + refs[1]) - results.append([results_flattend[0], results_flattend[1]]) - results.append([results_flattend[2], results_flattend[3]]) - assert (results[src_rank][src_gpu_index] == cp.ones( - array_size, dtype=cp.float32) * ( - (src_rank + 1) * 2 + src_gpu_index)).all() - assert (results[dst_rank][dst_gpu_index] == cp.ones( - array_size, dtype=cp.float32) * ( - (src_rank + 1) * 2 + src_gpu_index)).all() - ray.get([a.destroy_group.remote() for a in 
actors]) diff --git a/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py b/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py index a0dd4508001f..0f17b79ba63e 100644 --- a/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py +++ b/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py @@ -69,9 +69,9 @@ def test_availability(ray_start_distributed_2_nodes_4_gpus): actor0_nccl_availability = ray.get( actors[0].report_nccl_availability.remote()) assert actor0_nccl_availability - actor0_gloo_availability = ray.get( - actors[0].report_gloo_availability.remote()) - assert not actor0_gloo_availability + actor0_mpi_availability = ray.get( + actors[0].report_mpi_availability.remote()) + assert not actor0_mpi_availability def test_is_group_initialized(ray_start_distributed_2_nodes_4_gpus): diff --git a/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py b/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py index 5c1ecd7f14d8..408ebce76b8a 100644 --- a/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py +++ b/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py @@ -60,8 +60,7 @@ def test_broadcast_torch_cupy(ray_start_distributed_2_nodes_4_gpus, src_rank): assert (results[1] == torch.ones((10, )).cuda() * world_size).all() -def test_broadcast_invalid_rank(ray_start_distributed_2_nodes_4_gpus, - src_rank=3): +def test_broadcast_invalid_rank(ray_start_single_node_2_gpus, src_rank=3): world_size = 2 actors, _ = create_collective_workers(world_size) with pytest.raises(ValueError): diff --git a/python/ray/util/collective/tests/sinlge_node_tests/__init__.py b/python/ray/util/collective/tests/sinlge_node_tests/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git 
a/python/ray/util/collective/tests/sinlge_node_tests/test_allgather.py b/python/ray/util/collective/tests/test_allgather.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_allgather.py rename to python/ray/util/collective/tests/test_allgather.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_allreduce.py b/python/ray/util/collective/tests/test_allreduce.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_allreduce.py rename to python/ray/util/collective/tests/test_allreduce.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py b/python/ray/util/collective/tests/test_basic_apis.py similarity index 97% rename from python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py rename to python/ray/util/collective/tests/test_basic_apis.py index 29a3ec3f4a15..8c23442a3b4c 100644 --- a/python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py +++ b/python/ray/util/collective/tests/test_basic_apis.py @@ -64,9 +64,9 @@ def test_availability(ray_start_single_node_2_gpus): actor0_nccl_availability = ray.get( actors[0].report_nccl_availability.remote()) assert actor0_nccl_availability - actor0_gloo_availability = ray.get( - actors[0].report_gloo_availability.remote()) - assert not actor0_gloo_availability + actor0_mpi_availability = ray.get( + actors[0].report_mpi_availability.remote()) + assert not actor0_mpi_availability def test_is_group_initialized(ray_start_single_node_2_gpus): diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_broadcast.py b/python/ray/util/collective/tests/test_broadcast.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_broadcast.py rename to python/ray/util/collective/tests/test_broadcast.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_reduce.py b/python/ray/util/collective/tests/test_reduce.py similarity index 
100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_reduce.py rename to python/ray/util/collective/tests/test_reduce.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_reducescatter.py b/python/ray/util/collective/tests/test_reducescatter.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_reducescatter.py rename to python/ray/util/collective/tests/test_reducescatter.py diff --git a/python/ray/util/collective/tests/sinlge_node_tests/test_sendrecv.py b/python/ray/util/collective/tests/test_sendrecv.py similarity index 100% rename from python/ray/util/collective/tests/sinlge_node_tests/test_sendrecv.py rename to python/ray/util/collective/tests/test_sendrecv.py diff --git a/python/ray/util/collective/tests/util.py b/python/ray/util/collective/tests/util.py index a5fb97a53ad5..259ee24c9727 100644 --- a/python/ray/util/collective/tests/util.py +++ b/python/ray/util/collective/tests/util.py @@ -1,29 +1,20 @@ import cupy as cp -import logging import ray import ray.util.collective as col from ray.util.collective.types import Backend, ReduceOp -from ray.util.collective.collective_group.nccl_util import get_num_gpus import torch -logger = logging.getLogger(__name__) - @ray.remote(num_gpus=1) class Worker: def __init__(self): - self.buffer = None - self.list_buffer = None - - def init_tensors(self): self.buffer = cp.ones((10, ), dtype=cp.float32) self.list_buffer = [ - cp.ones((10, ), dtype=cp.float32) for _ in range(2) + cp.ones((10, ), dtype=cp.float32), + cp.ones((10, ), dtype=cp.float32) ] - cp.cuda.Stream.null.synchronize() - return True def init_group(self, world_size, @@ -88,8 +79,8 @@ def report_nccl_availability(self): avail = col.nccl_available() return avail - def report_gloo_availability(self): - avail = col.gloo_available() + def report_mpi_availability(self): + avail = col.mpi_available() return avail def report_is_group_initialized(self, group_name="default"): @@ -100,11 +91,7 
@@ def report_is_group_initialized(self, group_name="default"): def create_collective_workers(num_workers=2, group_name="default", backend="nccl"): - actors = [None] * num_workers - for i in range(num_workers): - actor = Worker.remote() - ray.get([actor.init_tensors.remote()]) - actors[i] = actor + actors = [Worker.remote() for _ in range(num_workers)] world_size = num_workers init_results = ray.get([ actor.init_group.remote(world_size, i, backend, group_name) @@ -125,7 +112,7 @@ def init_tensors_for_gather_scatter(actors, t = torch.ones(array_size, dtype=torch.float32).cuda() * (i + 1) else: raise RuntimeError("Unsupported tensor backend.") - ray.get([a.set_buffer.remote(t)]) + ray.wait([a.set_buffer.remote(t)]) if tensor_backend == "cupy": list_buffer = [ cp.ones(array_size, dtype=dtype) for _ in range(world_size) @@ -138,250 +125,3 @@ def init_tensors_for_gather_scatter(actors, else: raise RuntimeError("Unsupported tensor backend.") ray.get([a.set_list_buffer.remote(list_buffer) for a in actors]) - - -@ray.remote(num_gpus=2) -class MultiGPUWorker: - def __init__(self): - self.buffer0 = None - self.buffer1 = None - self.list_buffer0 = None - self.list_buffer1 = None - - def __del__(self): - self.buffer0 = None - self.buffer1 = None - self.list_buffer0 = None - self.list_buffer1 = None - - def init_tensors(self): - with cp.cuda.Device(0): - self.buffer0 = cp.ones((10, ), dtype=cp.float32) - self.list_buffer0 = [ - cp.ones((10, ), dtype=cp.float32) for _ in range(4) - ] - with cp.cuda.Device(1): - self.buffer1 = cp.ones((10, ), dtype=cp.float32) - self.list_buffer1 = [ - cp.ones((10, ), dtype=cp.float32) for _ in range(4) - ] - cp.cuda.Stream.null.synchronize() - return True - - def init_group(self, - world_size, - rank, - backend=Backend.NCCL, - group_name="default"): - col.init_collective_group(world_size, rank, backend, group_name) - return True - - def set_buffer(self, - size, - value0=1.0, - value1=1.0, - dtype=cp.float32, - tensor_type0="cupy", - 
tensor_type1="cupy"): - if tensor_type0 == "cupy": - with cp.cuda.Device(0): - self.buffer0 = cp.ones(size, dtype=dtype) * value0 - elif tensor_type0 == "torch": - self.buffer0 = torch.ones( - size, dtype=torch.float32).cuda(0) * value0 - else: - raise RuntimeError() - - if tensor_type1 == "cupy": - with cp.cuda.Device(1): - self.buffer1 = cp.ones(size, dtype=dtype) * value1 - elif tensor_type1 == "torch": - self.buffer1 = torch.ones( - size, dtype=torch.float32).cuda(1) * value1 - else: - raise RuntimeError() - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - # cp.cuda.Stream.null.synchronize() - return True - - def set_list_buffer(self, - size, - value0=1.0, - value1=1.0, - dtype=cp.float32, - tensor_type0="cupy", - tensor_type1="cupy"): - if tensor_type0 == "cupy": - with cp.cuda.Device(0): - self.list_buffer0 = [ - cp.ones(size, dtype=dtype) * value0 for _ in range(4) - ] - elif tensor_type0 == "torch": - self.list_buffer0 = [ - torch.ones(size, dtype=torch.float32).cuda(0) * value0 - for _ in range(4) - ] - else: - raise RuntimeError() - - if tensor_type1 == "cupy": - with cp.cuda.Device(1): - self.list_buffer1 = [ - cp.ones(size, dtype=dtype) * value1 for _ in range(4) - ] - elif tensor_type1 == "torch": - self.list_buffer1 = [ - torch.ones(size, dtype=torch.float32).cuda(1) * value1 - for _ in range(4) - ] - else: - raise RuntimeError() - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return True - - @ray.method(num_returns=2) - def get_buffer(self): - return self.buffer0, self.buffer1 - - def do_allreduce_multigpu(self, group_name="default", op=ReduceOp.SUM): - col.allreduce_multigpu([self.buffer0, self.buffer1], group_name, op) - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.buffer0 - - def do_reduce_multigpu(self, - group_name="default", - dst_rank=0, - dst_gpu_index=0, - op=ReduceOp.SUM): - col.reduce_multigpu([self.buffer0, self.buffer1], dst_rank, - dst_gpu_index, group_name, op) 
- cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.buffer0, self.buffer1 - - def do_broadcast_multigpu(self, - group_name="default", - src_rank=0, - src_gpu_index=0): - col.broadcast_multigpu([self.buffer0, self.buffer1], src_rank, - src_gpu_index, group_name) - return self.buffer0, self.buffer1 - - def do_allgather_multigpu(self, group_name="default"): - col.allgather_multigpu([self.list_buffer0, self.list_buffer1], - [self.buffer0, self.buffer1], group_name) - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.list_buffer0, self.list_buffer1 - - def do_reducescatter_multigpu(self, group_name="default", op=ReduceOp.SUM): - col.reducescatter_multigpu([self.buffer0, self.buffer1], - [self.list_buffer0, self.list_buffer1], - group_name, op) - cp.cuda.Device(0).synchronize() - cp.cuda.Device(1).synchronize() - return self.buffer0, self.buffer1 - - def do_send_multigpu(self, - group_name="default", - dst_rank=0, - dst_gpu_index=0, - src_gpu_index=0): - if src_gpu_index == 0: - col.send_multigpu(self.buffer0, dst_rank, dst_gpu_index, - group_name) - cp.cuda.Device(0).synchronize() - return self.buffer0 - elif src_gpu_index == 1: - col.send_multigpu(self.buffer1, dst_rank, dst_gpu_index, - group_name) - cp.cuda.Device(1).synchronize() - return self.buffer1 - else: - raise RuntimeError() - - def do_recv_multigpu(self, - group_name="default", - src_rank=0, - src_gpu_index=0, - dst_gpu_index=0): - if dst_gpu_index == 0: - col.recv_multigpu(self.buffer0, src_rank, src_gpu_index, - group_name) - cp.cuda.Device(0).synchronize() - return self.buffer0 - elif dst_gpu_index == 1: - col.recv_multigpu(self.buffer1, src_rank, src_gpu_index, - group_name) - cp.cuda.Device(1).synchronize() - return self.buffer1 - else: - raise RuntimeError() - - def destroy_group(self, group_name="default"): - col.destroy_collective_group(group_name) - return True - - def report_rank(self, group_name="default"): - rank = 
col.get_rank(group_name) - return rank - - def report_world_size(self, group_name="default"): - ws = col.get_world_size(group_name) - return ws - - def report_nccl_availability(self): - avail = col.nccl_available() - return avail - - def report_gloo_availability(self): - avail = col.gloo_available() - return avail - - def report_is_group_initialized(self, group_name="default"): - is_init = col.is_group_initialized(group_name) - return is_init - - def report_num_gpus(self): - n_gpus = get_num_gpus() - return n_gpus - - -def create_collective_multigpu_workers(num_workers=2, - group_name="default", - backend="nccl"): - actors = [None] * num_workers - for i in range(num_workers): - actor = MultiGPUWorker.remote() - ray.get([actor.set_buffer.remote([10])], timeout=10) - ray.get([actor.set_list_buffer.remote([10])], timeout=10) - actors[i] = actor - world_size = num_workers - init_results = ray.get([ - actor.init_group.remote(world_size, i, backend, group_name) - for i, actor in enumerate(actors) - ]) - return actors, init_results - - -def init_tensors_for_gather_scatter_multigpu(actors, - array_size=10, - tensor_backend="cupy"): - for i, a in enumerate(actors): - if tensor_backend == "cupy": - ray.get([a.set_buffer.remote(array_size)]) - ray.get([a.set_list_buffer.remote(array_size)]) - elif tensor_backend == "torch": - ray.get([ - a.set_buffer.remote( - array_size, tensor_type0="torch", tensor_type1="torch") - ]) - ray.get([ - a.set_list_buffer.remote( - array_size, tensor_type0="torch", tensor_type1="torch") - ]) - else: - raise RuntimeError("Unsupported tensor backend.") diff --git a/python/ray/util/collective/types.py b/python/ray/util/collective/types.py index d3e964486f77..c12dde84cb6a 100644 --- a/python/ray/util/collective/types.py +++ b/python/ray/util/collective/types.py @@ -30,7 +30,6 @@ class Backend(object): """A class to represent different backends.""" NCCL = "nccl" MPI = "mpi" - GLOO = "gloo" UNRECOGNIZED = "unrecognized" def __new__(cls, name: str): @@ 
-39,8 +38,6 @@ def __new__(cls, name: str): raise ValueError("Unrecognized backend: '{}'. " "Only NCCL is supported".format(name)) if backend == Backend.MPI: - raise RuntimeError("Ray does not support MPI backend.") - if backend == Backend.GLOO: raise NotImplementedError() return backend @@ -70,7 +67,6 @@ class BarrierOptions: class ReduceOptions: reduceOp = ReduceOp.SUM root_rank = 0 - root_tensor = 0 # index for multi-gpu reduce operations timeout_ms = unset_timeout_ms @@ -89,7 +85,6 @@ class AllGatherOptions: @dataclass class BroadcastOptions: root_rank = 0 - root_tensor = 0 timeout_ms = unset_timeout_ms @@ -97,17 +92,3 @@ class BroadcastOptions: class ReduceScatterOptions: reduceOp = ReduceOp.SUM timeout_ms = unset_timeout_ms - - -@dataclass -class SendOptions: - dst_rank = 0 - dst_gpu_index = 0 - timeout_ms = unset_timeout_ms - - -@dataclass -class RecvOptions: - src_rank = 0 - src_gpu_index = 0 - unset_timeout_ms = unset_timeout_ms diff --git a/python/ray/util/dask/__init__.py b/python/ray/util/dask/__init__.py index 10a08379c847..bfe28571ad75 100644 --- a/python/ray/util/dask/__init__.py +++ b/python/ray/util/dask/__init__.py @@ -4,16 +4,11 @@ local_ray_callbacks, unpack_ray_callbacks, ) -from .optimizations import dataframe_optimize __all__ = [ - # Schedulers "ray_dask_get", "ray_dask_get_sync", - # Callbacks "RayDaskCallback", "local_ray_callbacks", "unpack_ray_callbacks", - # Optimizations - "dataframe_optimize", ] diff --git a/python/ray/util/dask/optimizations.py b/python/ray/util/dask/optimizations.py deleted file mode 100644 index c36757af691f..000000000000 --- a/python/ray/util/dask/optimizations.py +++ /dev/null @@ -1,160 +0,0 @@ -import operator -import warnings - -import dask -from dask import core -from dask.core import istask -from dask.dataframe.core import _concat -from dask.dataframe.optimize import optimize -from dask.dataframe.shuffle import shuffle_group -from dask.highlevelgraph import HighLevelGraph - -from .scheduler import 
MultipleReturnFunc, multiple_return_get - -try: - from dask.dataframe.shuffle import SimpleShuffleLayer -except ImportError: - # SimpleShuffleLayer doesn't exist in this version of Dask. - SimpleShuffleLayer = None - -if SimpleShuffleLayer is not None: - - class MultipleReturnSimpleShuffleLayer(SimpleShuffleLayer): - @classmethod - def clone(cls, layer: SimpleShuffleLayer): - # TODO(Clark): Probably don't need this since SimpleShuffleLayer - # implements __copy__() and the shallow clone should be enough? - return cls( - name=layer.name, - column=layer.column, - npartitions=layer.npartitions, - npartitions_input=layer.npartitions_input, - ignore_index=layer.ignore_index, - name_input=layer.name_input, - meta_input=layer.meta_input, - parts_out=layer.parts_out, - annotations=layer.annotations, - ) - - def __repr__(self): - return (f"MultipleReturnSimpleShuffleLayer") - - def __reduce__(self): - attrs = [ - "name", - "column", - "npartitions", - "npartitions_input", - "ignore_index", - "name_input", - "meta_input", - "parts_out", - "annotations", - ] - return (MultipleReturnSimpleShuffleLayer, - tuple(getattr(self, attr) for attr in attrs)) - - def _cull(self, parts_out): - return MultipleReturnSimpleShuffleLayer( - self.name, - self.column, - self.npartitions, - self.npartitions_input, - self.ignore_index, - self.name_input, - self.meta_input, - parts_out=parts_out, - ) - - def _construct_graph(self): - """Construct graph for a simple shuffle operation.""" - - shuffle_group_name = "group-" + self.name - shuffle_split_name = "split-" + self.name - - dsk = {} - n_parts_out = len(self.parts_out) - for part_out in self.parts_out: - # TODO(Clark): Find better pattern than in-scheduler concat. 
- _concat_list = [(shuffle_split_name, part_out, part_in) - for part_in in range(self.npartitions_input)] - dsk[(self.name, part_out)] = (_concat, _concat_list, - self.ignore_index) - for _, _part_out, _part_in in _concat_list: - dsk[(shuffle_split_name, _part_out, _part_in)] = ( - multiple_return_get, - (shuffle_group_name, _part_in), - _part_out, - ) - if (shuffle_group_name, _part_in) not in dsk: - dsk[(shuffle_group_name, _part_in)] = ( - MultipleReturnFunc( - shuffle_group, - n_parts_out, - ), - (self.name_input, _part_in), - self.column, - 0, - self.npartitions, - self.npartitions, - self.ignore_index, - self.npartitions, - ) - - return dsk - - def rewrite_simple_shuffle_layer(dsk, keys): - if not isinstance(dsk, HighLevelGraph): - dsk = HighLevelGraph.from_collections( - id(dsk), dsk, dependencies=()) - else: - dsk = dsk.copy() - - layers = dsk.layers.copy() - for key, layer in layers.items(): - if type(layer) is SimpleShuffleLayer: - dsk.layers[key] = MultipleReturnSimpleShuffleLayer.clone(layer) - return dsk - - def dataframe_optimize(dsk, keys, **kwargs): - if not isinstance(keys, (list, set)): - keys = [keys] - keys = list(core.flatten(keys)) - - if not isinstance(dsk, HighLevelGraph): - dsk = HighLevelGraph.from_collections( - id(dsk), dsk, dependencies=()) - - dsk = rewrite_simple_shuffle_layer(dsk, keys=keys) - return optimize(dsk, keys, **kwargs) -else: - - def dataframe_optimize(dsk, keys, **kwargs): - warnings.warn("Custom dataframe shuffle optimization only works on " - "dask>=2020.12.0, you are on version " - f"{dask.__version__}, please upgrade Dask." - "Falling back to default dataframe optimizer.") - return optimize(dsk, keys, **kwargs) - - -# Stale approaches below. 
- - -def fuse_splits_into_multiple_return(dsk, keys): - if not isinstance(dsk, HighLevelGraph): - dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=()) - else: - dsk = dsk.copy() - dependencies = dsk.dependencies.copy() - for k, v in dsk.items(): - if istask(v) and v[0] == shuffle_group: - task_deps = dependencies[k] - # Only rewrite shuffle group split if all downstream dependencies - # are splits. - if all( - istask(dsk[dep]) and dsk[dep][0] == operator.getitem - for dep in task_deps): - for dep in task_deps: - # Rewrite split - pass diff --git a/python/ray/util/dask/scheduler.py b/python/ray/util/dask/scheduler.py index d6a8a6edc132..0614d35641ec 100644 --- a/python/ray/util/dask/scheduler.py +++ b/python/ray/util/dask/scheduler.py @@ -1,7 +1,6 @@ import atexit from collections import defaultdict from multiprocessing.pool import ThreadPool -from dataclasses import dataclass import threading import ray @@ -271,31 +270,19 @@ def _rayify_task( return alternate_return func, args = task[0], task[1:] - if func is multiple_return_get: - return _execute_task(task, deps) # If the function's arguments contain nested object references, we must # unpack said object references into a flat set of arguments so that # Ray properly tracks the object dependencies between Ray tasks. - arg_object_refs, repack = unpack_object_refs(args, deps) + object_refs, repack = unpack_object_refs(args, deps) # Submit the task using a wrapper function. 
- object_refs = dask_task_wrapper.options( - name=f"dask:{key!s}", - num_returns=(1 if not isinstance(func, MultipleReturnFunc) else - func.num_returns), - ).remote( - func, - repack, - key, - ray_pretask_cbs, - ray_posttask_cbs, - *arg_object_refs, - ) + object_ref = dask_task_wrapper.options(name=f"dask:{key!s}").remote( + func, repack, key, ray_pretask_cbs, ray_posttask_cbs, *object_refs) if ray_postsubmit_cbs is not None: for cb in ray_postsubmit_cbs: - cb(task, key, deps, object_refs) + cb(task, key, deps, object_ref) - return object_refs + return object_ref elif not ishashable(task): return task elif task in deps: @@ -447,16 +434,3 @@ def ray_dask_get_sync(dsk, keys, **kwargs): cb(result) return result - - -@dataclass -class MultipleReturnFunc: - func: callable - num_returns: int - - def __call__(self, *args, **kwargs): - return self.func(*args, **kwargs) - - -def multiple_return_get(multiple_returns, idx): - return multiple_returns[idx] diff --git a/python/ray/util/lightning_accelerators/BUILD b/python/ray/util/lightning_accelerators/BUILD deleted file mode 100644 index 4355c6d33bb4..000000000000 --- a/python/ray/util/lightning_accelerators/BUILD +++ /dev/null @@ -1,33 +0,0 @@ -# -------------------------------------------------------------------- -# Tests from the python/ray/util/lightning_accelerators/tests directory. -# Please keep these sorted alphabetically. -# -------------------------------------------------------------------- - -py_test( - name = "test_horovod_ray_accelerator", - size = "medium", - srcs = ["tests/test_horovod_ray_accelerator.py"], - tags = ["exclusive", "pytorch-lightning", "pytorch", "horovod"], - deps = [":accelerator_lib"], -) - -# -------------------------------------------------------------------- -# Tests from the python/ray/util/lightning_accelerators/examples directory. -# Please keep these sorted alphabetically. 
-# -------------------------------------------------------------------- - -py_test( - name = "ptl_horovod_ray_example", - size = "medium", - srcs = ["examples/ptl_horovod_ray_example.py"], - tags = ["exclusive", "example", "pytorch-lightning", "pytorch", "horovod"], - deps = [":accelerator_lib"], - args = ["--smoke-test"] -) - -# # This is a dummy test dependency that causes the above tests to be -# # re-run if any of these files changes. -py_library( - name = "accelerator_lib", - srcs = glob(["**/*.py"], exclude=["tests/*.py"]), -) diff --git a/python/ray/util/lightning_accelerators/__init__.py b/python/ray/util/lightning_accelerators/__init__.py deleted file mode 100644 index 038180e016ef..000000000000 --- a/python/ray/util/lightning_accelerators/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from ray.util.lightning_accelerators.horovod_ray_accelerator import \ - HorovodRayAccelerator - -__all__ = ["HorovodRayAccelerator"] diff --git a/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py b/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py deleted file mode 100644 index fffcfb01f54b..000000000000 --- a/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Example using Pytorch Lightning with a Horovod on Ray Accelerator.""" -import os -import tempfile - -import pytorch_lightning as pl -import torch -from torch.utils.data import random_split, DataLoader -from torchvision.datasets import MNIST -from torchvision import transforms - -import ray -from ray import tune -from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier -from ray.tune.integration.pytorch_lightning import TuneReportCallback -from ray.util.lightning_accelerators import HorovodRayAccelerator - - -class MNISTClassifier(LightningMNISTClassifier): - def prepare_data(self): - self.dataset = MNIST( - self.data_dir, - train=True, - download=True, - transform=transforms.ToTensor()) - - def 
train_dataloader(self): - dataset = self.dataset - train_length = len(dataset) - dataset_train, _ = random_split( - dataset, [train_length - 5000, 5000], - generator=torch.Generator().manual_seed(0)) - loader = DataLoader( - dataset_train, - batch_size=self.batch_size, - shuffle=True, - num_workers=1, - drop_last=True, - pin_memory=True, - ) - return loader - - def val_dataloader(self): - dataset = self.dataset - train_length = len(dataset) - _, dataset_val = random_split( - dataset, [train_length - 5000, 5000], - generator=torch.Generator().manual_seed(0)) - loader = DataLoader( - dataset_val, - batch_size=self.batch_size, - shuffle=False, - num_workers=1, - drop_last=True, - pin_memory=True, - ) - return loader - - -def train_mnist(config, - data_dir=None, - num_epochs=10, - num_hosts=1, - num_slots=4, - use_gpu=False, - callbacks=None): - model = MNISTClassifier(config, data_dir) - - callbacks = callbacks or [] - - trainer = pl.Trainer( - max_epochs=num_epochs, - gpus=int(use_gpu), - callbacks=callbacks, - accelerator=HorovodRayAccelerator( - num_hosts=num_hosts, num_slots=num_slots, use_gpu=use_gpu)) - trainer.fit(model) - - -def tune_mnist(data_dir, - num_samples=10, - num_epochs=10, - num_hosts=1, - num_slots=4, - use_gpu=False): - config = { - "layer_1": tune.choice([32, 64, 128]), - "layer_2": tune.choice([64, 128, 256]), - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - } - - # Add Tune callback. - metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"} - callbacks = [TuneReportCallback(metrics, on="validation_end")] - trainable = tune.with_parameters( - train_mnist, - data_dir=data_dir, - num_epochs=num_epochs, - num_hosts=num_hosts, - num_slots=num_slots, - use_gpu=use_gpu, - callbacks=callbacks) - analysis = tune.run( - trainable, - metric="loss", - mode="min", - config=config, - num_samples=num_samples, - resources_per_trial={ - "cpu": 1, - # Assume 1 cpu per slot. 
- "extra_cpu": num_hosts * num_slots, - # Assume 1 gpu per slot. - "extra_gpu": num_hosts * num_slots * int(use_gpu) - }, - name="tune_mnist") - - print("Best hyperparameters found were: ", analysis.best_config) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--num-hosts", - type=int, - help="Number of machines to train on. If using Tune, then each " - "trial will use this many machines.", - default=1) - parser.add_argument( - "--num-slots", - type=int, - help="Number of workers to " - "place on each " - "machine. If using " - "Tune, then each trial will use this many slots per machine.", - default=1) - parser.add_argument( - "--use-gpu", action="store_true", help="Use GPU for " - "training.") - parser.add_argument( - "--tune", - action="store_true", - help="Use Ray Tune " - "for " - "hyperparameter " - "tuning.") - parser.add_argument( - "--num-samples", - type=int, - default=10, - help="Number " - "of " - "samples to tune.") - parser.add_argument( - "--num-epochs", - type=int, - default=10, - help="Number " - "of " - "epochs " - "to train for.") - parser.add_argument( - "--smoke-test", action="store_true", help="Finish quickly for testing") - parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") - args, _ = parser.parse_known_args() - - num_epochs = 1 if args.smoke_test else args.num_epochs - num_hosts = 1 if args.smoke_test else args.num_hosts - num_slots = 1 if args.smoke_test else args.num_slots - use_gpu = False if args.smoke_test else args.use_gpu - num_samples = 1 if args.smoke_test else args.num_samples - - if args.smoke_test: - ray.init(num_cpus=2) - else: - ray.init(address=args.address) - - data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_") - - if args.tune: - raise NotImplementedError("Using Tune + Pytorch Lightning with " - "distributed training is currently not " - "supported.") - tune_mnist(data_dir, num_samples, num_epochs, 
num_hosts, num_slots, - use_gpu) - else: - config = {"layer_1": 32, "layer_2": 64, "lr": 1e-1, "batch_size": 32} - train_mnist(config, data_dir, num_epochs, num_hosts, num_slots, - use_gpu) diff --git a/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py b/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py deleted file mode 100644 index 04f73317a923..000000000000 --- a/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py +++ /dev/null @@ -1,121 +0,0 @@ -import ray -from pytorch_lightning.accelerators.horovod_accelerator import \ - HorovodAccelerator - -try: - import horovod.torch as hvd - from horovod.ray import RayExecutor -except (ModuleNotFoundError, ImportError): - HOROVOD_AVAILABLE = False -else: - HOROVOD_AVAILABLE = True - - -def get_executable_cls(): - # Only used for testing purposes, currently. - # We need to override this in tests to ensure test path is set correctly. - return None - - -class HorovodRayAccelerator(HorovodAccelerator): - """Pytorch Lightning Accelerator for Horovod training on a Ray cluster. - - This accelerator is used to manage distributed training on a Ray cluster - via the Horovod training framework. Internally, the specified number of - Ray actors are launched in the cluster and are configured as part of the - Horovod ring. The Pytorch Lightning trainer is instantiated on the - driver and sent to each of these training workers where training is - executed. The distributed training protocol is handled by Horovod. - - Each training worker is configured to reserve 1 CPU and if 1 GPU if - ``use_gpu`` is set to ``True``. - - If using this accelerator, you should run your code like a normal Python - script: ``python train.py``, and not with ``horovodrun``. - - Args: - num_hosts (int): The number of nodes/machines to execute the job on. - num_slots (int): Number of workers to be placed on each machine. - use_gpu (bool): Whether to use GPU for allocation. 
For GPU to be - used, you must also set the ``gpus`` arg in your Pytorch Lightning - Trainer to a value > 0. - - Example: - - .. code_block:: python - - import pytorch_lightning as ptl - from ray.util.lightning_accelerators import HorovodRayAccelerator - - ptl_model = MNISTClassifier(...) - # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU. - accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4, - use_gpu=True). - - # If using GPUs, set the ``gpus`` arg to a value > 0. - # The actual number of GPUs is determined by ``num_slots``. - trainer = pl.Trainer(..., gpus=1, accelerator=accelerator). - trainer.fit(ptl_model). - - """ - - def __init__(self, - *args, - num_hosts=1, - num_slots=1, - use_gpu=False, - **kwargs): - super().__init__(*args, trainer=None, **kwargs) - self.nickname = "horovod_ray" - self.num_hosts = num_hosts - self.num_slots = num_slots - self.use_gpu = use_gpu - - def setup(self, model): - self.trainer.use_horovod = True - settings = RayExecutor.create_settings(timeout_s=30) - self.executor = RayExecutor( - settings, - num_hosts=self.num_hosts, - num_slots=self.num_slots, - use_gpu=self.use_gpu) - self.trainer.model = model - self.executor.start(executable_cls=get_executable_cls()) - - def train(self): - trainer = self.trainer - trainer_ref = ray.put(self.trainer) - self.trainer = None - results = self.executor.run(self.train_remote, args=[trainer_ref]) - results, state_dict, best_path = results[0] - - self.trainer = trainer - self.trainer.model.load_state_dict(state_dict) - if self.trainer.checkpoint_callback: - self.trainer.checkpoint_callback.best_model_path = best_path - - return results - - def train_remote(self, trainer_ref): - self.trainer = ray.get(trainer_ref) - hvd.init() - if self.trainer.on_gpu: - # Horovod assigns one local GPU per process. - self.trainer.root_gpu = hvd.local_rank() - - # TODO: Make changes in PTL to clean this up. 
- super(HorovodRayAccelerator, self).setup(self.trainer.model) - results = super(HorovodRayAccelerator, self).train() - if hvd.rank() != 0: - # Only want results from the first worker. - return None - - best_model_path = None - if self.trainer.checkpoint_callback is not None: - best_model_path = self.trainer.checkpoint_callback.best_model_path - - model = self.trainer.model - return results, model.state_dict(), best_model_path - - def teardown(self): - self.executor.shutdown() diff --git a/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py b/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py deleted file mode 100644 index 1d8bb9d5e71c..000000000000 --- a/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py +++ /dev/null @@ -1,191 +0,0 @@ -import os - -import torch -import pytest -import ray -from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule -from ray.util.sgd.tests.test_ptl import PTL_Module -from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier -from ray.util.lightning_accelerators import HorovodRayAccelerator -import pytorch_lightning as pl - -try: - import horovod # noqa: F401 - from horovod.common.util import nccl_built -except ImportError: - HOROVOD_AVAILABLE = False -else: - HOROVOD_AVAILABLE = True - - -def _nccl_available(): - if not HOROVOD_AVAILABLE: - return False - try: - return nccl_built() - except AttributeError: - return False - - -@pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) - yield address_info - ray.shutdown() - - -@pytest.fixture -def ray_start_2_gpus(): - address_info = ray.init(num_cpus=2, num_gpus=2) - yield address_info - ray.shutdown() - # This env var is set by Pytorch Lightning. - # Make sure to reset it after each test. - # TODO: Upstream to PTL to not set this env var if using Ray. 
- del os.environ["CUDA_VISIBLE_DEVICES"] - - -@pytest.fixture -def seed(): - pl.seed_everything(0) - - -def get_model(lr=1e-2, hidden_size=1, data_size=10, val_size=10, batch_size=2): - config = { - "lr": lr, - "hidden_size": hidden_size, - "data_size": data_size, - "val_size": val_size, - "batch_size": batch_size - } - return PTL_Module(config) - - -def get_trainer(dir, - num_slots=2, - use_gpu=False, - max_epochs=1, - limit_train_batches=10, - limit_val_batches=10, - progress_bar_refresh_rate=0): - accelerator = HorovodRayAccelerator(num_slots=num_slots, use_gpu=use_gpu) - trainer = pl.Trainer( - default_root_dir=dir, - gpus=1 if use_gpu else 0, - max_epochs=max_epochs, - limit_train_batches=limit_train_batches, - limit_val_batches=limit_val_batches, - progress_bar_refresh_rate=progress_bar_refresh_rate, - checkpoint_callback=True, - accelerator=accelerator) - return trainer - - -def train_test(trainer, model): - initial_values = torch.tensor( - [torch.sum(torch.abs(x)) for x in model.parameters()]) - result = trainer.fit(model) - post_train_values = torch.tensor( - [torch.sum(torch.abs(x)) for x in model.parameters()]) - assert result == 1, "trainer failed" - # Check that the model is actually changed post-training. 
- assert torch.norm(initial_values - post_train_values) > 0.1 - - -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_train(tmpdir, ray_start_2_cpus, seed, num_slots): - model = get_model() - - trainer = get_trainer(tmpdir, num_slots=num_slots) - train_test(trainer, model) - - -def load_test(trainer, model): - trainer.fit(model) - trained_model = PTL_Module.load_from_checkpoint( - trainer.checkpoint_callback.best_model_path, config=model.config) - assert trained_model is not None, "loading model failed" - - -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_load(tmpdir, ray_start_2_cpus, seed, num_slots): - model = get_model() - trainer = get_trainer(tmpdir, num_slots=num_slots) - load_test(trainer, model) - - -def predict_test(trainer, model, dm): - trainer.fit(model, dm) - test_loader = dm.test_dataloader() - acc = pl.metrics.Accuracy() - for batch in test_loader: - x, y = batch - with torch.no_grad(): - y_hat = model(x) - y_hat = y_hat.cpu() - acc.update(y_hat, y) - average_acc = acc.compute() - assert average_acc >= 0.5, f"This model is expected to get > {0.5} in " \ - f"test set (it got {average_acc})" - - -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_predict(tmpdir, ray_start_2_cpus, seed, num_slots): - config = { - "layer_1": 32, - "layer_2": 32, - "lr": 1e-2, - "batch_size": 32, - } - model = LightningMNISTClassifier(config, tmpdir) - dm = MNISTDataModule( - data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"]) - trainer = get_trainer( - tmpdir, limit_train_batches=10, max_epochs=1, num_slots=num_slots) - predict_test(trainer, model, dm) - - -@pytest.mark.skipif( - not _nccl_available(), reason="test requires Horovod with NCCL support") -@pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): - model = get_model() - trainer = get_trainer(tmpdir, num_slots=num_slots, 
use_gpu=True) - train_test(trainer, model) - - -@pytest.mark.skipif( - not _nccl_available(), reason="test requires Horovod with NCCL support") -@pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): - model = get_model() - trainer = get_trainer(tmpdir, num_slots=num_slots, use_gpu=True) - load_test(trainer, model) - - -@pytest.mark.skipif( - not _nccl_available(), reason="test requires Horovod with NCCL support") -@pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.parametrize("num_slots", [1, 2]) -def test_predict_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): - config = { - "layer_1": 32, - "layer_2": 32, - "lr": 1e-2, - "batch_size": 32, - } - model = LightningMNISTClassifier(config, tmpdir) - dm = MNISTDataModule( - data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"]) - trainer = get_trainer( - tmpdir, - limit_train_batches=10, - max_epochs=1, - num_slots=num_slots, - use_gpu=True) - predict_test(trainer, model, dm) diff --git a/python/ray/util/metrics.py b/python/ray/util/metrics.py index 57a01cf7aa0b..d287a503fa73 100644 --- a/python/ray/util/metrics.py +++ b/python/ray/util/metrics.py @@ -147,11 +147,6 @@ def __init__(self, self._metric = CythonCount(self._name, self._description, self._unit, self._tag_keys) - def __reduce__(self): - deserializer = Count - serialized_data = (self._name, self._description, self._tag_keys) - return deserializer, serialized_data - class Histogram(Metric): """Histogram distribution of metric points. 
@@ -182,12 +177,6 @@ def __init__(self, self._unit, self.boundaries, self._tag_keys) - def __reduce__(self): - deserializer = Histogram - serialized_data = (self._name, self._description, self.boundaries, - self._tag_keys) - return deserializer, serialized_data - @property def info(self): """Return information about histogram metric.""" @@ -215,11 +204,6 @@ def __init__(self, self._metric = CythonGauge(self._name, self._description, self._unit, self._tag_keys) - def __reduce__(self): - deserializer = Gauge - serialized_data = (self._name, self._description, self._tag_keys) - return deserializer, serialized_data - __all__ = [ "Count", diff --git a/python/ray/util/multiprocessing/pool.py b/python/ray/util/multiprocessing/pool.py index b74e10279568..2d8f3d5fb911 100644 --- a/python/ray/util/multiprocessing/pool.py +++ b/python/ray/util/multiprocessing/pool.py @@ -9,7 +9,6 @@ import copy import ray -from ray.util import log_once logger = logging.getLogger(__name__) @@ -337,7 +336,7 @@ def __init__(self, self._maxtasksperchild = maxtasksperchild or -1 self._actor_deletion_ids = [] - if context and log_once("context_argument_warning"): + if context: logger.warning("The 'context' argument is not supported using " "ray. 
Please refer to the documentation for how " "to control ray initialization.") @@ -495,7 +494,7 @@ def _submit_chunk(self, def _chunk_and_run(self, func, iterable, chunksize=None, unpack_args=False): if not hasattr(iterable, "__len__"): - iterable = list(iterable) + iterable = [iterable] if chunksize is None: chunksize = self._calculate_chunksize(iterable) diff --git a/python/ray/util/placement_group.py b/python/ray/util/placement_group.py index c723f77d3ecc..be24772ab518 100644 --- a/python/ray/util/placement_group.py +++ b/python/ray/util/placement_group.py @@ -4,7 +4,6 @@ import ray from ray._raylet import PlacementGroupID, ObjectRef -from ray.utils import hex_to_binary bundle_reservation_check = None @@ -146,8 +145,7 @@ def _fill_bundle_cache_if_needed(self): def placement_group(bundles: List[Dict[str, float]], strategy: str = "PACK", - name: str = "", - lifetime=None) -> PlacementGroup: + name: str = "unnamed_group") -> PlacementGroup: """Asynchronously creates a PlacementGroup. Args: @@ -162,10 +160,6 @@ def placement_group(bundles: List[Dict[str, float]], - "STRICT_SPREAD": Packs Bundles across distinct nodes. name(str): The name of the placement group. - lifetime(str): Either `None`, which defaults to the placement group - will fate share with its creator and will be deleted once its - creator is dead, or "detached", which means the placement group - will live as a global object independent of the creator. Return: PlacementGroup: Placement group object. @@ -185,16 +179,8 @@ def placement_group(bundles: List[Dict[str, float]], "Bundles cannot be an empty dictionary or " f"resources with only 0 values. 
Bundles: {bundles}") - if lifetime is None: - detached = False - elif lifetime == "detached": - detached = True - else: - raise ValueError("placement group `lifetime` argument must be either" - " `None` or 'detached'") - placement_group_id = worker.core_worker.create_placement_group( - name, bundles, strategy, detached) + name, bundles, strategy) return PlacementGroup(placement_group_id) @@ -212,29 +198,6 @@ def remove_placement_group(placement_group: PlacementGroup): worker.core_worker.remove_placement_group(placement_group.id) -def get_placement_group(placement_group_name: str): - """Get a placement group object with a global name. - - Returns: - None if can't find a placement group with the given name. - The placement group object otherwise. - """ - if not placement_group_name: - raise ValueError( - "Please supply a non-empty value to get_placement_group") - worker = ray.worker.global_worker - worker.check_connected() - placement_group_info = ray.state.state.get_placement_group_by_name( - placement_group_name) - if placement_group_info is None: - raise ValueError( - f"Failed to look up actor with name: {placement_group_name}") - else: - return PlacementGroup( - PlacementGroupID( - hex_to_binary(placement_group_info["placement_group_id"]))) - - def placement_group_table(placement_group: PlacementGroup = None) -> list: """Get the state of the placement group from GCS. diff --git a/python/ray/util/serialization.py b/python/ray/util/serialization.py index cb9e2b1b9dac..a93bbab55acb 100644 --- a/python/ray/util/serialization.py +++ b/python/ray/util/serialization.py @@ -16,14 +16,3 @@ def register_serializer(cls, *, serializer, deserializer): """ context = ray.worker.global_worker.get_serialization_context() context._register_cloudpickle_serializer(cls, serializer, deserializer) - - -def deregister_serializer(cls): - """Deregister the serializer associated with the type ``cls``. - There is no effect if the serializer is unavailable. 
- - Args: - cls: A Python class/type. - """ - context = ray.worker.global_worker.get_serialization_context() - context._unregister_cloudpickle_reducer(cls) diff --git a/python/ray/util/sgd/BUILD b/python/ray/util/sgd/BUILD index cbdc52cb479a..896560136626 100644 --- a/python/ray/util/sgd/BUILD +++ b/python/ray/util/sgd/BUILD @@ -241,20 +241,6 @@ py_test( args = ["--smoke-test"] ) -# -------------------------------------------------------------------- -# SGD related tests from the ../../../../release directory. -# Please keep these sorted alphabetically. -# -------------------------------------------------------------------- - -py_test( - name = "pytorch_pbt_failure", - size = "medium", - srcs = ["torch/examples/pytorch_pbt_failure.py"], - tags = ["exlusive", "pytorch", "release"], - deps = [":sgd_lib"], - args = ["--smoke-test"] -) - # This is a dummy test dependency that causes the above tests to be # re-run if any of these files changes. py_library( diff --git a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml index fcf31354b70e..846f5f10ce3c 100644 --- a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml +++ b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-tf # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 +initial_workers: 3 max_workers: 3 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/README.rst b/python/ray/util/sgd/torch/examples/benchmarks/README.rst index 54b3ce192b68..78dd71a15f51 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/README.rst +++ b/python/ray/util/sgd/torch/examples/benchmarks/README.rst @@ -104,6 +104,7 @@ You can specify the number of nodes you want to use with the following configura # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: # Change this to a custom quantity + initial_workers: # same as above max_workers: # same as above You may want to install FP16 support for PyTorch with the following configuration in the YAML file: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml index 7e3db50510ff..04cbd520e135 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml +++ b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml @@ -4,8 +4,11 @@ cluster_name: horovod-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 +initial_workers: 1 max_workers: 1 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 50 # docker: diff --git a/python/ray/util/sgd/torch/examples/example-sgd.yaml b/python/ray/util/sgd/torch/examples/example-sgd.yaml index 6bbc64423aab..fe9b18d191b0 100644 --- a/python/ray/util/sgd/torch/examples/example-sgd.yaml +++ b/python/ray/util/sgd/torch/examples/example-sgd.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. 
min_workers: 3 +initial_workers: 3 max_workers: 3 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml index 7d9ff9be89e0..fccd5f8625bd 100644 --- a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-pytorch-imagenet # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 +initial_workers: 1 max_workers: 1 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py b/python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py deleted file mode 100644 index 053991885b4b..000000000000 --- a/python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py +++ /dev/null @@ -1,128 +0,0 @@ -import argparse -import numpy as np -import os -import torch -import torch.nn as nn -from torch.utils.data import DataLoader, Subset -from torchvision.datasets import CIFAR10 -import torchvision.transforms as transforms - -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers import PopulationBasedTraining -from ray.tune.utils.mock import FailureInjectorCallback -from ray.util.sgd.torch import TorchTrainer, TrainingOperator -from ray.util.sgd.torch.resnet import ResNet18 -from ray.util.sgd.utils import BATCH_SIZE - -parser = argparse.ArgumentParser() -parser.add_argument( - "--smoke-test", - action="store_true", - default=False, - help="Finish quickly for training.") -args = parser.parse_args() - - -def initialization_hook(): - # Need this for avoiding a connection restart issue on AWS. 
- os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" - os.environ["NCCL_LL_THRESHOLD"] = "0" - - # set the below if needed - # print("NCCL DEBUG SET") - # os.environ["NCCL_DEBUG"] = "INFO" - - -def cifar_creator(config): - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), - (0.2023, 0.1994, 0.2010)), - ]) # meanstd transformation - - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), - (0.2023, 0.1994, 0.2010)), - ]) - train_dataset = CIFAR10( - root="~/data", train=True, download=True, transform=transform_train) - validation_dataset = CIFAR10( - root="~/data", train=False, download=False, transform=transform_test) - - if config.get("test_mode"): - train_dataset = Subset(train_dataset, list(range(64))) - validation_dataset = Subset(validation_dataset, list(range(64))) - - train_loader = DataLoader( - train_dataset, batch_size=config[BATCH_SIZE], num_workers=2) - validation_loader = DataLoader( - validation_dataset, batch_size=config[BATCH_SIZE], num_workers=2) - return train_loader, validation_loader - - -def optimizer_creator(model, config): - """Returns optimizer""" - return torch.optim.SGD( - model.parameters(), - lr=config.get("lr", 0.1), - momentum=config.get("momentum", 0.9)) - - -ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) -num_training_workers = 1 if args.smoke_test else 3 - -CustomTrainingOperator = TrainingOperator.from_creators( - model_creator=ResNet18, - optimizer_creator=optimizer_creator, - data_creator=cifar_creator, - loss_creator=nn.CrossEntropyLoss) - -TorchTrainable = TorchTrainer.as_trainable( - training_operator_cls=CustomTrainingOperator, - initialization_hook=initialization_hook, - num_workers=num_training_workers, - config={ - "test_mode": args.smoke_test, - BATCH_SIZE: 128 * num_training_workers, - }, - 
use_gpu=not args.smoke_test) - -pbt_scheduler = PopulationBasedTraining( - time_attr="training_iteration", - metric="val_loss", - mode="min", - perturbation_interval=1, - hyperparam_mutations={ - # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], - }) - -reporter = CLIReporter() -reporter.add_metric_column("val_loss", "loss") -reporter.add_metric_column("val_accuracy", "acc") - -analysis = tune.run( - TorchTrainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "head_location": None, - "worker_locations": None - }, - max_failures=-1, # used for fault tolerance - checkpoint_freq=2, # used for fault tolerance - progress_reporter=reporter, - scheduler=pbt_scheduler, - callbacks=[FailureInjectorCallback()], - queue_trials=True, - stop={"training_iteration": 1} if args.smoke_test else None) - -print(analysis.get_best_config(metric="val_loss", mode="min")) diff --git a/python/ray/util/sgd/torch/examples/segmentation/example.yaml b/python/ray/util/sgd/torch/examples/segmentation/example.yaml index 33db0f445537..78cd9bcb09ba 100644 --- a/python/ray/util/sgd/torch/examples/segmentation/example.yaml +++ b/python/ray/util/sgd/torch/examples/segmentation/example.yaml @@ -4,8 +4,10 @@ cluster_name: sgd-coco-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 +initial_workers: 1 max_workers: 1 +target_utilization_fraction: 0.9 # Cloud-provider specific configuration. 
provider: type: aws diff --git a/python/ray/util/sgd/torch/examples/sgd-development.yaml b/python/ray/util/sgd/torch/examples/sgd-development.yaml index bc79803eeadd..590cb63b0708 100644 --- a/python/ray/util/sgd/torch/examples/sgd-development.yaml +++ b/python/ray/util/sgd/torch/examples/sgd-development.yaml @@ -4,8 +4,11 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 2 +initial_workers: 2 max_workers: 2 +target_utilization_fraction: 0.9 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml index 434b48d3044f..4cecd3bf86a1 100644 --- a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml @@ -4,8 +4,10 @@ cluster_name: transformer-cluster # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 +initial_workers: 3 max_workers: 3 +target_utilization_fraction: 0.9 # Cloud-provider specific configuration. provider: type: aws diff --git a/python/ray/worker.py b/python/ray/worker.py index 7239b80a982e..350bbc6491e5 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -9,6 +9,7 @@ import logging import os import redis +from six.moves import queue import sys import threading import time @@ -68,12 +69,6 @@ logger = logging.getLogger(__name__) -# Visible for testing. -def _unhandled_error_handler(e: Exception): - logger.error("Unhandled error (suppress with " - "RAY_IGNORE_UNHANDLED_ERRORS=1): {}".format(e)) - - class Worker: """A class used to define the control flow of a worker process. 
@@ -282,14 +277,6 @@ def put_object(self, value, object_ref=None): self.core_worker.put_serialized_object( serialized_value, object_ref=object_ref)) - def raise_errors(self, data_metadata_pairs, object_refs): - context = self.get_serialization_context() - out = context.deserialize_objects(data_metadata_pairs, object_refs) - if "RAY_IGNORE_UNHANDLED_ERRORS" in os.environ: - return - for e in out: - _unhandled_error_handler(e) - def deserialize_objects(self, data_metadata_pairs, object_refs): context = self.get_serialization_context() return context.deserialize_objects(data_metadata_pairs, object_refs) @@ -601,6 +588,12 @@ def init( directory for the Ray process. Defaults to an OS-specific conventional location, e.g., "/tmp/ray". _java_worker_options: Overwrite the options to start Java workers. + _lru_evict (bool): If True, when an object store is full, it will evict + objects in LRU order to make more space and when under memory + pressure, ray.ObjectLostError may be thrown. If False, then + reference counting will be used to decide which objects are safe + to evict and when under memory pressure, ray.ObjectStoreFullError + may be thrown. _metrics_export_port(int): Port number Ray exposes system metrics through a Prometheus endpoint. It is currently under active development, and the API is subject to change. @@ -738,6 +731,9 @@ def init( if _system_config is not None and len(_system_config) != 0: raise ValueError("When connecting to an existing cluster, " "_system_config must not be provided.") + if _lru_evict: + raise ValueError("When connecting to an existing cluster, " + "_lru_evict must not be provided.") if _enable_object_reconstruction: raise ValueError( "When connecting to an existing cluster, " @@ -822,8 +818,6 @@ def shutdown(_exiting_interpreter=False): # Shut down the Ray processes. 
global _global_node if _global_node is not None: - if _global_node.is_head(): - _global_node.destroy_external_storage() _global_node.kill_all_processes(check_alive=False, allow_graceful=True) _global_node = None @@ -867,6 +861,13 @@ def custom_excepthook(type, value, tb): sys.excepthook = custom_excepthook +# The last time we raised a TaskError in this process. We use this value to +# suppress redundant error messages pushed from the workers. +last_task_error_raise_time = 0 + +# The max amount of seconds to wait before printing out an uncaught error. +UNCAUGHT_ERROR_GRACE_PERIOD = 5 + def print_logs(redis_client, threads_stopped, job_id): """Prints log messages from workers on all of the nodes. @@ -1017,7 +1018,42 @@ def color_for(data: Dict[str, str]) -> str: file=print_file) -def listen_error_messages_raylet(worker, threads_stopped): +def print_error_messages_raylet(task_error_queue, threads_stopped): + """Prints message received in the given output queue. + + This checks periodically if any un-raised errors occurred in the + background. + + Args: + task_error_queue (queue.Queue): A queue used to receive errors from the + thread that listens to Redis. + threads_stopped (threading.Event): A threading event used to signal to + the thread that it should exit. + """ + + while True: + # Exit if we received a signal that we should stop. + if threads_stopped.is_set(): + return + + try: + error, t = task_error_queue.get(block=False) + except queue.Empty: + threads_stopped.wait(timeout=0.01) + continue + # Delay errors a little bit of time to attempt to suppress redundant + # messages originating from the worker. 
+ while t + UNCAUGHT_ERROR_GRACE_PERIOD > time.time(): + threads_stopped.wait(timeout=1) + if threads_stopped.is_set(): + break + if t < last_task_error_raise_time + UNCAUGHT_ERROR_GRACE_PERIOD: + logger.debug(f"Suppressing error from worker: {error}") + else: + logger.error(f"Possible unhandled error from worker: {error}") + + +def listen_error_messages_raylet(worker, task_error_queue, threads_stopped): """Listen to error messages in the background on the driver. This runs in a separate thread on the driver and pushes (error, time) @@ -1025,6 +1061,8 @@ def listen_error_messages_raylet(worker, threads_stopped): Args: worker: The worker class that this thread belongs to. + task_error_queue (queue.Queue): A queue used to communicate with the + thread that prints the errors found by this thread. threads_stopped (threading.Event): A threading event used to signal to the thread that it should exit. """ @@ -1063,9 +1101,8 @@ def listen_error_messages_raylet(worker, threads_stopped): error_message = error_data.error_message if (error_data.type == ray_constants.TASK_PUSH_ERROR): - # TODO(ekl) remove task push errors entirely now that we have - # the separate unhandled exception handler. - pass + # Delay it a bit to see if we can suppress it + task_error_queue.put((error_message, time.time())) else: logger.warning(error_message) except (OSError, redis.exceptions.ConnectionError) as e: @@ -1228,12 +1265,19 @@ def connect(node, # temporarily using this implementation which constantly queries the # scheduler for new error messages. 
if mode == SCRIPT_MODE: + q = queue.Queue() worker.listener_thread = threading.Thread( target=listen_error_messages_raylet, name="ray_listen_error_messages", - args=(worker, worker.threads_stopped)) + args=(worker, q, worker.threads_stopped)) + worker.printer_thread = threading.Thread( + target=print_error_messages_raylet, + name="ray_print_error_messages", + args=(q, worker.threads_stopped)) worker.listener_thread.daemon = True worker.listener_thread.start() + worker.printer_thread.daemon = True + worker.printer_thread.start() if log_to_driver: global_worker_stdstream_dispatcher.add_handler( "ray_print_logs", print_to_stdstream) @@ -1286,6 +1330,8 @@ def disconnect(exiting_interpreter=False): worker.import_thread.join_import_thread() if hasattr(worker, "listener_thread"): worker.listener_thread.join() + if hasattr(worker, "printer_thread"): + worker.printer_thread.join() if hasattr(worker, "logger_thread"): worker.logger_thread.join() worker.threads_stopped.clear() @@ -1397,11 +1443,13 @@ def get(object_refs, *, timeout=None): raise ValueError("'object_refs' must either be an object ref " "or a list of object refs.") + global last_task_error_raise_time # TODO(ujvl): Consider how to allow user to retrieve the ready objects. values, debugger_breakpoint = worker.get_objects( object_refs, timeout=timeout) for i, value in enumerate(values): if isinstance(value, RayError): + last_task_error_raise_time = time.time() if isinstance(value, ray.exceptions.ObjectLostError): worker.core_worker.dump_object_store_memory_usage() if isinstance(value, RayTaskError): @@ -1718,6 +1766,7 @@ def decorator(function_or_class): return decorator +@client_mode_hook def remote(*args, **kwargs): """Defines a remote function or an actor class. 
diff --git a/python/ray/workers/default_worker.py b/python/ray/workers/default_worker.py index 7b9c2677bd0b..d9f7837ff2ce 100644 --- a/python/ray/workers/default_worker.py +++ b/python/ray/workers/default_worker.py @@ -109,21 +109,6 @@ help="A list of directories or jar files separated by colon that specify " "the search path for user code. This will be used as `CLASSPATH` in " "Java and `PYTHONPATH` in Python.") -parser.add_argument( - "--logging-rotate-bytes", - required=False, - type=int, - default=ray_constants.LOGGING_ROTATE_BYTES, - help="Specify the max bytes for rotating " - "log file, default is " - f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.") -parser.add_argument( - "--logging-rotate-backup-count", - required=False, - type=int, - default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, - help="Specify the backup count of rotated log file, default is " - f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.") if __name__ == "__main__": # NOTE(sang): For some reason, if we move the code below # to a separate function, tensorflow will capture that method diff --git a/python/requirements.txt b/python/requirements.txt index 17a3c233f26a..28c387fde7b3 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -8,7 +8,6 @@ aiohttp==3.7 aioredis click >= 7.0 -cloudpickle colorama colorful filelock diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt new file mode 100644 index 000000000000..8d75554d451b --- /dev/null +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -0,0 +1,886 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile requirements_tune.in +# +--find-links https://download.pytorch.org/whl/torch_stable.html + +absl-py==0.11.0 + # via tensorboard +alembic==1.4.1 + # via + # mlflow + # optuna +argon2-cffi==20.1.0 + # via notebook +async-generator==1.10 + # via nbclient +atari-py==0.2.6 + # via + # -c ../requirements.txt + # gym 
+attrs==20.3.0 + # via + # cmd2 + # jsonschema + # pytest +autocfg==0.0.6 + # via gluoncv +autogluon.core==0.0.16b20210113 + # via gluoncv +autograd==1.3 + # via autogluon.core +ax-platform==0.1.9 ; python_version < "3.7" + # via -r requirements_tune.in +azure-core==1.10.0 + # via azure-storage-blob +azure-storage-blob==12.6.0 + # via mlflow +backcall==0.2.0 + # via ipython +bayesian-optimization==1.2.0 + # via + # -r requirements_tune.in + # nevergrad +bcrypt==3.2.0 + # via paramiko +bleach==3.2.1 + # via nbconvert +bokeh==2.2.3 + # via dask +boto3==1.16.53 + # via + # -c ../requirements.txt + # autogluon.core + # smart-open +botocore==1.19.53 + # via + # boto3 + # s3transfer +botorch==0.2.1 + # via ax-platform +cached-property==1.5.2 + # via h5py +cachetools==4.2.0 + # via google-auth +certifi==2020.12.5 + # via + # kubernetes + # msrest + # requests + # sentry-sdk +cffi==1.14.4 + # via + # argon2-cffi + # bcrypt + # cryptography + # pynacl +chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -c ../requirements.txt + # databricks-cli + # distributed + # flask + # mlflow + # sacremoses + # wandb +cliff==3.5.0 + # via optuna +cloudpickle==1.6.0 + # via + # dask + # distributed + # gym + # hyperopt + # mlflow + # tensorflow-probability +cma==3.0.3 + # via nevergrad +cmaes==0.7.0 + # via optuna +cmd2==1.4.0 + # via cliff +colorama==0.4.4 + # via + # -c ../requirements.txt + # cmd2 +colorlog==4.6.2 + # via optuna +configparser==5.0.1 + # via wandb +configspace==0.4.10 + # via + # -r requirements_tune.in + # autogluon.core + # hpbandster +contextvars==2.4 + # via distributed +cryptography==3.3.1 + # via + # azure-storage-blob + # paramiko +cycler==0.10.0 + # via matplotlib +cython==0.29.0 + # via + # -c ../requirements.txt + # autogluon.core + # configspace +dask[complete]==2020.12.0 + # via + # -c ../requirements.txt + # autogluon.core + # distributed +databricks-cli==0.14.1 + # via mlflow +dataclasses==0.8 ; python_version < "3.7" + # via + # -c 
../requirements.txt + # autocfg + # torch + # transformers +decorator==4.4.2 + # via + # ipython + # networkx + # paramz + # tensorflow-probability + # traitlets +decord==0.4.2 + # via gluoncv +defusedxml==0.6.0 + # via nbconvert +dill==0.3.3 + # via autogluon.core +distributed==2020.12.0 + # via + # autogluon.core + # dask +dm-tree==0.1.5 + # via + # -c ../requirements.txt + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb +docker==4.4.1 + # via mlflow +dragonfly-opt==0.1.6 + # via -r requirements_tune.in +entrypoints==0.3 + # via + # mlflow + # nbconvert +filelock==3.0.12 + # via + # -c ../requirements.txt + # transformers +flask==1.1.2 + # via + # -c ../requirements.txt + # mlflow + # prometheus-flask-exporter +fsspec==0.8.5 + # via + # dask + # pytorch-lightning +future==0.18.2 + # via + # autograd + # dragonfly-opt + # hyperopt + # pyglet + # pytorch-lightning + # torch +gast==0.4.0 + # via tensorflow-probability +gitdb==4.0.5 + # via gitpython +gitpython==3.1.12 + # via + # mlflow + # wandb +gluoncv==0.9.1 + # via -r requirements_tune.in +google-auth-oauthlib==0.4.2 + # via tensorboard +google-auth==1.24.0 + # via + # google-auth-oauthlib + # kubernetes + # tensorboard +gpy==1.9.9 + # via -r requirements_tune.in +gpytorch==1.3.0 + # via botorch +graphviz==0.8.4 + # via + # autogluon.core + # mxnet +grpcio==1.34.1 + # via + # -c ../requirements.txt + # tensorboard +gunicorn==20.0.4 + # via mlflow +gym[atari]==0.18.0 + # via + # -c ../requirements.txt + # -r requirements_tune.in +h5py==3.1.0 + # via + # -r requirements_tune.in + # keras +heapdict==1.0.1 + # via zict +hpbandster==0.7.4 + # via -r requirements_tune.in +hyperopt==0.2.5 + # via -r requirements_tune.in +idna==2.10 + # via requests +immutables==0.14 + # via contextvars +importlib-metadata==3.4.0 + # via + # cmd2 + # jsonschema + # markdown + # pluggy + # pytest + # stevedore +ipykernel==5.4.3 + # via + # ipywidgets + # jupyter + # jupyter-console + # notebook + # qtconsole 
+ipython-genutils==0.2.0 + # via + # nbformat + # notebook + # qtconsole + # traitlets +ipython==7.16.1 + # via + # ipykernel + # ipywidgets + # jupyter-console +ipywidgets==7.6.3 + # via jupyter +isodate==0.6.0 + # via msrest +itsdangerous==1.1.0 + # via flask +jedi==0.18.0 + # via ipython +jinja2==2.11.2 + # via + # ax-platform + # bokeh + # flask + # nbconvert + # notebook +jmespath==0.10.0 + # via + # boto3 + # botocore +joblib==1.0.0 + # via + # optuna + # sacremoses + # scikit-learn + # scikit-optimize +jsonschema==3.2.0 + # via + # -c ../requirements.txt + # nbformat +jupyter-client==6.1.11 + # via + # ipykernel + # jupyter-console + # nbclient + # notebook + # qtconsole +jupyter-console==6.2.0 + # via jupyter +jupyter-core==4.7.0 + # via + # jupyter-client + # nbconvert + # nbformat + # notebook + # qtconsole +jupyter==1.0.0 + # via -r requirements_tune.in +jupyterlab-pygments==0.1.2 + # via nbconvert +jupyterlab-widgets==1.0.0 + # via ipywidgets +keras==2.4.3 + # via -r requirements_tune.in +kiwisolver==1.3.1 + # via matplotlib +kubernetes==12.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +lightgbm==3.1.1 + # via -r requirements_tune.in +locket==0.2.0 + # via partd +mako==1.1.3 + # via alembic +markdown==3.3.3 + # via tensorboard +markupsafe==1.1.1 + # via + # jinja2 + # mako +matplotlib==3.3.3 + # via + # -r requirements_tune.in + # autogluon.core + # gluoncv + # zoopt +mistune==0.8.4 + # via nbconvert +mlflow==1.13.1 + # via -r requirements_tune.in +more-itertools==8.6.0 + # via pytest +msgpack==1.0.2 + # via + # -c ../requirements.txt + # distributed +msrest==0.6.19 + # via azure-storage-blob +mxnet==1.7.0.post1 + # via -r requirements_tune.in +nbclient==0.5.1 + # via nbconvert +nbconvert==6.0.7 + # via + # jupyter + # notebook +nbformat==5.0.8 + # via + # ipywidgets + # nbclient + # nbconvert + # notebook +nest-asyncio==1.4.3 + # via nbclient +netifaces==0.10.9 + # via hpbandster +networkx==2.5 + # via + # -c ../requirements.txt + 
# hyperopt +nevergrad==0.4.2.post5 + # via -r requirements_tune.in +notebook==6.2.0 + # via + # jupyter + # widgetsnbextension +numpy==1.19.5 + # via + # -c ../requirements.txt + # atari-py + # autogluon.core + # autograd + # bayesian-optimization + # bokeh + # cma + # cmaes + # configspace + # dask + # decord + # dragonfly-opt + # gluoncv + # gpy + # gym + # h5py + # hpbandster + # hyperopt + # keras + # lightgbm + # matplotlib + # mlflow + # mxnet + # nevergrad + # opencv-python + # optuna + # pandas + # paramz + # patsy + # pytorch-lightning + # scikit-learn + # scikit-optimize + # scipy + # statsmodels + # tensorboard + # tensorboardx + # tensorflow-probability + # torch + # torchvision + # transformers + # xgboost + # zoopt +oauthlib==3.1.0 + # via requests-oauthlib +opencv-python==4.5.1.48 + # via + # gluoncv + # gym +optuna==2.3.0 + # via -r requirements_tune.in +packaging==20.8 + # via + # bleach + # bokeh + # optuna + # pytest + # transformers +pandas==1.0.5 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # dask + # gluoncv + # mlflow + # statsmodels +pandocfilters==1.4.3 + # via nbconvert +paramiko==2.7.2 + # via autogluon.core +paramz==0.9.5 + # via gpy +parso==0.8.1 + # via jedi +partd==1.1.0 + # via dask +patsy==0.5.1 + # via statsmodels +pbr==5.5.1 + # via + # cliff + # stevedore +pexpect==4.8.0 + # via + # -c ../requirements.txt + # ipython +pickleshare==0.7.5 + # via ipython +pillow==7.2.0 ; platform_system != "Windows" + # via + # -c ../requirements.txt + # bokeh + # gluoncv + # gym + # matplotlib + # torchvision +plotly==4.14.3 + # via ax-platform +pluggy==0.13.1 + # via pytest +portalocker==2.0.0 + # via gluoncv +prettytable==0.7.2 + # via cliff +prometheus-client==0.9.0 + # via + # -c ../requirements.txt + # notebook + # prometheus-flask-exporter +prometheus-flask-exporter==0.18.1 + # via mlflow +promise==2.3 + # via wandb +prompt-toolkit==3.0.10 + # via + # ipython + # jupyter-console +protobuf==3.14.0 + # via + # -c 
../requirements.txt + # mlflow + # tensorboard + # tensorboardx + # wandb +psutil==5.8.0 + # via + # distributed + # wandb +ptyprocess==0.7.0 + # via + # pexpect + # terminado +py==1.10.0 + # via pytest +pyaml==20.4.0 + # via scikit-optimize +pyasn1-modules==0.2.8 + # via google-auth +pyasn1==0.4.8 + # via + # pyasn1-modules + # rsa +pycparser==2.20 + # via cffi +pyglet==1.5.0 + # via gym +pygments==2.7.4 + # via + # -c ../requirements.txt + # ipython + # jupyter-console + # jupyterlab-pygments + # nbconvert + # qtconsole +pynacl==1.4.0 + # via paramiko +pyparsing==2.4.7 + # via + # cliff + # configspace + # matplotlib + # packaging +pyperclip==1.8.1 + # via cmd2 +pyro4==4.80 + # via hpbandster +pyrsistent==0.17.3 + # via jsonschema +pytest-remotedata==0.3.2 + # via -r requirements_tune.in +pytest==5.4.3 + # via + # -c ../requirements.txt + # autogluon.core + # pytest-remotedata +python-dateutil==2.8.1 + # via + # alembic + # bokeh + # botocore + # jupyter-client + # kubernetes + # matplotlib + # mlflow + # pandas + # wandb +python-editor==1.0.4 + # via alembic +pytorch-lightning-bolts==0.2.5 + # via -r requirements_tune.in +pytorch-lightning==1.0.3 + # via + # -r requirements_tune.in + # pytorch-lightning-bolts +pytz==2020.5 + # via pandas +pyyaml==5.3.1 + # via + # -c ../requirements.txt + # autocfg + # bokeh + # cliff + # dask + # distributed + # gluoncv + # keras + # kubernetes + # mlflow + # pyaml + # pytorch-lightning + # wandb + # yacs +pyzmq==20.0.0 + # via + # jupyter-client + # notebook + # qtconsole +qtconsole==5.0.1 + # via jupyter +qtpy==1.9.0 + # via qtconsole +querystring-parser==1.2.4 + # via mlflow +regex==2020.11.13 + # via + # sacremoses + # transformers +requests-oauthlib==1.3.0 + # via + # google-auth-oauthlib + # kubernetes + # msrest +requests==2.25.1 + # via + # -c ../requirements.txt + # autogluon.core + # azure-core + # databricks-cli + # docker + # gluoncv + # kubernetes + # mlflow + # msrest + # mxnet + # requests-oauthlib + # sigopt + # 
tensorboard + # transformers + # wandb +retrying==1.3.3 + # via plotly +rsa==4.7 + # via google-auth +s3transfer==0.3.4 + # via boto3 +sacremoses==0.0.43 + # via transformers +scikit-learn==0.22.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in + # autogluon.core + # ax-platform + # bayesian-optimization + # gpytorch + # lightgbm + # scikit-optimize +scikit-optimize==0.8.1 + # via + # -r requirements_tune.in + # autogluon.core +scipy==1.4.1 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # bayesian-optimization + # botorch + # dragonfly-opt + # gluoncv + # gpy + # gpytorch + # gym + # hpbandster + # hyperopt + # keras + # lightgbm + # optuna + # paramz + # scikit-learn + # scikit-optimize + # statsmodels + # xgboost +send2trash==1.5.0 + # via notebook +sentencepiece==0.1.95 + # via transformers +sentry-sdk==0.19.5 + # via wandb +serpent==1.30.2 + # via + # hpbandster + # pyro4 +shortuuid==1.0.1 + # via wandb +sigopt==5.7.0 + # via -r requirements_tune.in +six==1.15.0 + # via + # absl-py + # argon2-cffi + # atari-py + # azure-core + # bcrypt + # bleach + # cliff + # cryptography + # cycler + # databricks-cli + # dm-tree + # docker + # docker-pycreds + # dragonfly-opt + # google-auth + # gpy + # grpcio + # hyperopt + # isodate + # jsonschema + # kubernetes + # mlflow + # paramz + # patsy + # plotly + # promise + # protobuf + # pynacl + # pytest-remotedata + # python-dateutil + # querystring-parser + # retrying + # sacremoses + # tensorboard + # tensorboardx + # tensorflow-probability + # traitlets + # wandb + # websocket-client +smart_open==4.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +smmap==3.0.4 + # via gitdb +sortedcontainers==2.3.0 + # via distributed +sqlalchemy==1.3.22 + # via + # alembic + # mlflow + # optuna +sqlparse==0.4.1 + # via mlflow +statsmodels==0.12.1 + # via hpbandster +stevedore==3.3.0 + # via cliff +subprocess32==3.5.4 + # via wandb +tabulate==0.8.7 + # via + # -c ../requirements.txt 
+ # databricks-cli +tblib==1.7.0 + # via distributed +tensorboard-plugin-wit==1.7.0 + # via tensorboard +tensorboard==2.4.0 + # via pytorch-lightning +tensorboardx==2.1 + # via + # -c ../requirements.txt + # gluoncv +tensorflow-probability==0.11.1 + # via -r requirements_tune.in +terminado==0.9.2 + # via notebook +testpath==0.4.4 + # via nbconvert +timm==0.3.2 + # via -r requirements_tune.in +tokenizers==0.8.1.rc2 + # via transformers +toolz==0.11.1 + # via + # dask + # distributed + # partd +torch==1.7.0+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # botorch + # gpytorch + # pytorch-lightning + # pytorch-lightning-bolts + # timm + # torchvision +torchvision==0.8.1+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # timm +tornado==6.1 + # via + # autogluon.core + # bokeh + # distributed + # ipykernel + # jupyter-client + # notebook + # terminado +tqdm==4.56.0 + # via + # autogluon.core + # gluoncv + # hyperopt + # optuna + # pytorch-lightning + # sacremoses + # transformers +traitlets==4.3.3 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # nbclient + # nbconvert + # nbformat + # notebook + # qtconsole +transformers==3.1 + # via -r requirements_tune.in +typing-extensions==3.7.4.3 + # via + # bokeh + # importlib-metadata + # nevergrad + # torch +typing==3.7.4.3 + # via configspace +urllib3==1.26.2 + # via + # botocore + # kubernetes + # requests + # sentry-sdk +wandb==0.10.12 + # via -r requirements_tune.in +watchdog==1.0.2 + # via wandb +wcwidth==0.2.5 + # via + # cmd2 + # prompt-toolkit + # pytest +webencodings==0.5.1 + # via bleach +websocket-client==0.57.0 + # via + # docker + # kubernetes +werkzeug==1.0.1 + # via + # -c ../requirements.txt + # flask + # tensorboard +wheel==0.36.2 + # via + # lightgbm + # tensorboard +widgetsnbextension==3.5.1 + # via ipywidgets +xgboost==1.3.0.post0 + # via -r requirements_tune.in +yacs==0.1.8 + # via gluoncv +zict==2.0.0 + # via distributed 
+zipp==3.4.0 + # via importlib-metadata +zoopt==0.4.1 + # via -r requirements_tune.in + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt new file mode 100644 index 000000000000..1ac1824330c0 --- /dev/null +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -0,0 +1,878 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile requirements_tune.in +# +--find-links https://download.pytorch.org/whl/torch_stable.html + +absl-py==0.11.0 + # via tensorboard +alembic==1.4.1 + # via + # mlflow + # optuna +argon2-cffi==20.1.0 + # via notebook +async-generator==1.10 + # via nbclient +atari-py==0.2.6 + # via + # -c ../requirements.txt + # gym +attrs==20.3.0 + # via + # cmd2 + # jsonschema + # pytest +autocfg==0.0.6 + # via gluoncv +autogluon.core==0.0.16b20210113 + # via gluoncv +autograd==1.3 + # via autogluon.core +ax-platform==0.1.19 ; python_version >= "3.7" + # via -r requirements_tune.in +azure-core==1.10.0 + # via azure-storage-blob +azure-storage-blob==12.6.0 + # via mlflow +backcall==0.2.0 + # via ipython +bayesian-optimization==1.2.0 + # via + # -r requirements_tune.in + # nevergrad +bcrypt==3.2.0 + # via paramiko +bleach==3.2.1 + # via nbconvert +bokeh==2.2.3 + # via dask +boto3==1.16.53 + # via + # -c ../requirements.txt + # autogluon.core + # smart-open +botocore==1.19.53 + # via + # boto3 + # s3transfer +botorch==0.3.3 + # via ax-platform +cached-property==1.5.2 + # via h5py +cachetools==4.2.0 + # via google-auth +certifi==2020.12.5 + # via + # kubernetes + # msrest + # requests + # sentry-sdk +cffi==1.14.4 + # via + # argon2-cffi + # bcrypt + # cryptography + # pynacl +chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -c ../requirements.txt + # databricks-cli + # distributed + # flask + # mlflow + # sacremoses + # wandb +cliff==3.5.0 + # via optuna 
+cloudpickle==1.6.0 + # via + # dask + # distributed + # gym + # hyperopt + # mlflow + # tensorflow-probability +cma==3.0.3 + # via nevergrad +cmaes==0.7.0 + # via optuna +cmd2==1.4.0 + # via cliff +colorama==0.4.4 + # via + # -c ../requirements.txt + # cmd2 +colorlog==4.6.2 + # via optuna +configparser==5.0.1 + # via wandb +configspace==0.4.10 + # via + # -r requirements_tune.in + # autogluon.core + # hpbandster +cryptography==3.3.1 + # via + # azure-storage-blob + # paramiko +cycler==0.10.0 + # via matplotlib +cython==0.29.0 + # via + # -c ../requirements.txt + # autogluon.core + # configspace +dask[complete]==2020.12.0 + # via + # -c ../requirements.txt + # autogluon.core + # distributed +databricks-cli==0.14.1 + # via mlflow +dataclasses==0.6 + # via torch +decorator==4.4.2 + # via + # ipython + # networkx + # paramz + # tensorflow-probability +decord==0.4.2 + # via gluoncv +defusedxml==0.6.0 + # via nbconvert +dill==0.3.3 + # via autogluon.core +distributed==2020.12.0 + # via + # autogluon.core + # dask +dm-tree==0.1.5 + # via + # -c ../requirements.txt + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb +docker==4.4.1 + # via mlflow +dragonfly-opt==0.1.6 + # via -r requirements_tune.in +entrypoints==0.3 + # via + # mlflow + # nbconvert +filelock==3.0.12 + # via + # -c ../requirements.txt + # transformers +flask==1.1.2 + # via + # -c ../requirements.txt + # mlflow + # prometheus-flask-exporter +fsspec==0.8.5 + # via + # dask + # pytorch-lightning +future==0.18.2 + # via + # autograd + # dragonfly-opt + # hyperopt + # pyglet + # pytorch-lightning + # torch +gast==0.4.0 + # via tensorflow-probability +gitdb==4.0.5 + # via gitpython +gitpython==3.1.12 + # via + # mlflow + # wandb +gluoncv==0.9.1 + # via -r requirements_tune.in +google-auth-oauthlib==0.4.2 + # via tensorboard +google-auth==1.24.0 + # via + # google-auth-oauthlib + # kubernetes + # tensorboard +gpy==1.9.9 + # via -r requirements_tune.in +gpytorch==1.3.0 + # via botorch +graphviz==0.8.4 
+ # via + # autogluon.core + # mxnet +grpcio==1.34.0 + # via + # -c ../requirements.txt + # tensorboard +gunicorn==20.0.4 + # via mlflow +gym[atari]==0.18.0 + # via + # -c ../requirements.txt + # -r requirements_tune.in +h5py==3.1.0 + # via + # -r requirements_tune.in + # keras +heapdict==1.0.1 + # via zict +hpbandster==0.7.4 + # via -r requirements_tune.in +hyperopt==0.2.5 + # via -r requirements_tune.in +idna==2.10 + # via requests +importlib-metadata==3.4.0 + # via + # cmd2 + # jsonschema + # markdown + # pluggy + # pytest + # stevedore +ipykernel==5.4.3 + # via + # ipywidgets + # jupyter + # jupyter-console + # notebook + # qtconsole +ipython-genutils==0.2.0 + # via + # nbformat + # notebook + # qtconsole + # traitlets +ipython==7.19.0 + # via + # ipykernel + # ipywidgets + # jupyter-console +ipywidgets==7.6.3 + # via jupyter +isodate==0.6.0 + # via msrest +itsdangerous==1.1.0 + # via flask +jedi==0.18.0 + # via ipython +jinja2==2.11.2 + # via + # ax-platform + # bokeh + # flask + # nbconvert + # notebook +jmespath==0.10.0 + # via + # boto3 + # botocore +joblib==1.0.0 + # via + # optuna + # sacremoses + # scikit-learn + # scikit-optimize +jsonschema==3.2.0 + # via + # -c ../requirements.txt + # nbformat +jupyter-client==6.1.11 + # via + # ipykernel + # jupyter-console + # nbclient + # notebook + # qtconsole +jupyter-console==6.2.0 + # via jupyter +jupyter-core==4.7.0 + # via + # jupyter-client + # nbconvert + # nbformat + # notebook + # qtconsole +jupyter==1.0.0 + # via -r requirements_tune.in +jupyterlab-pygments==0.1.2 + # via nbconvert +jupyterlab-widgets==1.0.0 + # via ipywidgets +keras==2.4.3 + # via -r requirements_tune.in +kiwisolver==1.3.1 + # via matplotlib +kubernetes==12.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +lightgbm==3.1.1 + # via -r requirements_tune.in +locket==0.2.0 + # via partd +mako==1.1.3 + # via alembic +markdown==3.3.3 + # via tensorboard +markupsafe==1.1.1 + # via + # jinja2 + # mako +matplotlib==3.3.3 + # via 
+ # -r requirements_tune.in + # autogluon.core + # gluoncv + # zoopt +mistune==0.8.4 + # via nbconvert +mlflow==1.13.1 + # via -r requirements_tune.in +more-itertools==8.6.0 + # via pytest +msgpack==1.0.2 + # via + # -c ../requirements.txt + # distributed +msrest==0.6.19 + # via azure-storage-blob +mxnet==1.7.0.post1 + # via -r requirements_tune.in +nbclient==0.5.1 + # via nbconvert +nbconvert==6.0.7 + # via + # jupyter + # notebook +nbformat==5.0.8 + # via + # ipywidgets + # nbclient + # nbconvert + # notebook +nest-asyncio==1.4.3 + # via nbclient +netifaces==0.10.9 + # via hpbandster +networkx==2.5 + # via + # -c ../requirements.txt + # hyperopt +nevergrad==0.4.2.post5 + # via -r requirements_tune.in +notebook==6.2.0 + # via + # jupyter + # widgetsnbextension +numpy==1.19.5 + # via + # -c ../requirements.txt + # atari-py + # autogluon.core + # autograd + # bayesian-optimization + # bokeh + # cma + # cmaes + # configspace + # dask + # decord + # dragonfly-opt + # gluoncv + # gpy + # gym + # h5py + # hpbandster + # hyperopt + # keras + # lightgbm + # matplotlib + # mlflow + # mxnet + # nevergrad + # opencv-python + # optuna + # pandas + # paramz + # patsy + # pytorch-lightning + # scikit-learn + # scikit-optimize + # scipy + # statsmodels + # tensorboard + # tensorboardx + # tensorflow-probability + # torch + # torchvision + # transformers + # xgboost + # zoopt +oauthlib==3.1.0 + # via requests-oauthlib +opencv-python==4.5.1.48 + # via + # gluoncv + # gym +optuna==2.3.0 + # via -r requirements_tune.in +packaging==20.8 + # via + # bleach + # bokeh + # optuna + # pytest + # transformers +pandas==1.0.5 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # dask + # gluoncv + # mlflow + # statsmodels +pandocfilters==1.4.3 + # via nbconvert +paramiko==2.7.2 + # via autogluon.core +paramz==0.9.5 + # via gpy +parso==0.8.1 + # via jedi +partd==1.1.0 + # via dask +patsy==0.5.1 + # via statsmodels +pbr==5.5.1 + # via + # cliff + # stevedore +pexpect==4.8.0 
+ # via + # -c ../requirements.txt + # ipython +pickleshare==0.7.5 + # via ipython +pillow==7.2.0 ; platform_system != "Windows" + # via + # -c ../requirements.txt + # bokeh + # gluoncv + # gym + # matplotlib + # torchvision +plotly==4.14.3 + # via ax-platform +pluggy==0.13.1 + # via pytest +portalocker==2.0.0 + # via gluoncv +prettytable==0.7.2 + # via cliff +prometheus-client==0.9.0 + # via + # -c ../requirements.txt + # notebook + # prometheus-flask-exporter +prometheus-flask-exporter==0.18.1 + # via mlflow +promise==2.3 + # via wandb +prompt-toolkit==3.0.10 + # via + # ipython + # jupyter-console +protobuf==3.14.0 + # via + # -c ../requirements.txt + # mlflow + # tensorboard + # tensorboardx + # wandb +psutil==5.8.0 + # via + # distributed + # wandb +ptyprocess==0.7.0 + # via + # pexpect + # terminado +py==1.10.0 + # via pytest +pyaml==20.4.0 + # via scikit-optimize +pyasn1-modules==0.2.8 + # via google-auth +pyasn1==0.4.8 + # via + # pyasn1-modules + # rsa +pycparser==2.20 + # via cffi +pyglet==1.5.0 + # via gym +pygments==2.7.4 + # via + # -c ../requirements.txt + # ipython + # jupyter-console + # jupyterlab-pygments + # nbconvert + # qtconsole +pynacl==1.4.0 + # via paramiko +pyparsing==2.4.7 + # via + # cliff + # configspace + # matplotlib + # packaging +pyperclip==1.8.1 + # via cmd2 +pyro4==4.80 + # via hpbandster +pyrsistent==0.17.3 + # via jsonschema +pytest-remotedata==0.3.2 + # via -r requirements_tune.in +pytest==5.4.3 + # via + # -c ../requirements.txt + # autogluon.core + # pytest-remotedata +python-dateutil==2.8.1 + # via + # alembic + # bokeh + # botocore + # jupyter-client + # kubernetes + # matplotlib + # mlflow + # pandas + # wandb +python-editor==1.0.4 + # via alembic +pytorch-lightning-bolts==0.2.5 + # via -r requirements_tune.in +pytorch-lightning==1.0.3 + # via + # -r requirements_tune.in + # pytorch-lightning-bolts +pytz==2020.5 + # via pandas +pyyaml==5.3.1 + # via + # -c ../requirements.txt + # autocfg + # bokeh + # cliff + # dask + # 
distributed + # gluoncv + # keras + # kubernetes + # mlflow + # pyaml + # pytorch-lightning + # wandb + # yacs +pyzmq==20.0.0 + # via + # jupyter-client + # notebook + # qtconsole +qtconsole==5.0.1 + # via jupyter +qtpy==1.9.0 + # via qtconsole +querystring-parser==1.2.4 + # via mlflow +regex==2020.11.13 + # via + # sacremoses + # transformers +requests-oauthlib==1.3.0 + # via + # google-auth-oauthlib + # kubernetes + # msrest +requests==2.25.1 + # via + # -c ../requirements.txt + # autogluon.core + # azure-core + # databricks-cli + # docker + # gluoncv + # kubernetes + # mlflow + # msrest + # mxnet + # requests-oauthlib + # sigopt + # tensorboard + # transformers + # wandb +retrying==1.3.3 + # via plotly +rsa==4.7 + # via google-auth +s3transfer==0.3.4 + # via boto3 +sacremoses==0.0.43 + # via transformers +scikit-learn==0.22.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in + # autogluon.core + # ax-platform + # bayesian-optimization + # gpytorch + # lightgbm + # scikit-optimize +scikit-optimize==0.8.1 + # via + # -r requirements_tune.in + # autogluon.core +scipy==1.4.1 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # bayesian-optimization + # botorch + # dragonfly-opt + # gluoncv + # gpy + # gpytorch + # gym + # hpbandster + # hyperopt + # keras + # lightgbm + # optuna + # paramz + # scikit-learn + # scikit-optimize + # statsmodels + # xgboost +send2trash==1.5.0 + # via notebook +sentencepiece==0.1.95 + # via transformers +sentry-sdk==0.19.5 + # via wandb +serpent==1.30.2 + # via + # hpbandster + # pyro4 +shortuuid==1.0.1 + # via wandb +sigopt==5.7.0 + # via -r requirements_tune.in +six==1.15.0 + # via + # absl-py + # argon2-cffi + # atari-py + # azure-core + # bcrypt + # bleach + # cliff + # cryptography + # cycler + # databricks-cli + # dm-tree + # docker + # docker-pycreds + # dragonfly-opt + # google-auth + # gpy + # grpcio + # hyperopt + # isodate + # jsonschema + # kubernetes + # mlflow + # paramz + # patsy + # 
plotly + # promise + # protobuf + # pynacl + # pytest-remotedata + # python-dateutil + # querystring-parser + # retrying + # sacremoses + # tensorboard + # tensorboardx + # tensorflow-probability + # wandb + # websocket-client +smart_open[s3]==4.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +smmap==3.0.4 + # via gitdb +sortedcontainers==2.3.0 + # via distributed +sqlalchemy==1.3.22 + # via + # alembic + # mlflow + # optuna +sqlparse==0.4.1 + # via mlflow +statsmodels==0.12.1 + # via hpbandster +stevedore==3.3.0 + # via cliff +subprocess32==3.5.4 + # via wandb +tabulate==0.8.7 + # via + # -c ../requirements.txt + # databricks-cli +tblib==1.7.0 + # via distributed +tensorboard-plugin-wit==1.7.0 + # via tensorboard +tensorboard==2.4.0 + # via pytorch-lightning +tensorboardx==2.1 + # via + # -c ../requirements.txt + # gluoncv +tensorflow-probability==0.11.1 + # via -r requirements_tune.in +terminado==0.9.2 + # via notebook +testpath==0.4.4 + # via nbconvert +timm==0.3.2 + # via -r requirements_tune.in +tokenizers==0.8.1.rc2 + # via transformers +toolz==0.11.1 + # via + # dask + # distributed + # partd +torch==1.7.0+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # botorch + # gpytorch + # pytorch-lightning + # pytorch-lightning-bolts + # timm + # torchvision +torchvision==0.8.1+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # timm +tornado==6.1 + # via + # autogluon.core + # bokeh + # distributed + # ipykernel + # jupyter-client + # notebook + # terminado +tqdm==4.56.0 + # via + # autogluon.core + # gluoncv + # hyperopt + # optuna + # pytorch-lightning + # sacremoses + # transformers +traitlets==5.0.5 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # nbclient + # nbconvert + # nbformat + # notebook + # qtconsole +transformers==3.1 + # via -r requirements_tune.in +typeguard==2.10.0 + # via ax-platform +typing-extensions==3.7.4.3 + # via + # bokeh + # importlib-metadata 
+ # nevergrad + # torch +typing==3.7.4.3 + # via configspace +urllib3==1.26.2 + # via + # botocore + # kubernetes + # requests + # sentry-sdk +wandb==0.10.12 + # via -r requirements_tune.in +watchdog==1.0.2 + # via wandb +wcwidth==0.2.5 + # via + # cmd2 + # prompt-toolkit + # pytest +webencodings==0.5.1 + # via bleach +websocket-client==0.57.0 + # via + # docker + # kubernetes +werkzeug==1.0.1 + # via + # -c ../requirements.txt + # flask + # tensorboard +wheel==0.36.2 + # via + # lightgbm + # tensorboard +widgetsnbextension==3.5.1 + # via ipywidgets +xgboost==1.3.0.post0 + # via -r requirements_tune.in +yacs==0.1.8 + # via gluoncv +zict==2.0.0 + # via distributed +zipp==3.4.0 + # via importlib-metadata +zoopt==0.4.1 + # via -r requirements_tune.in + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/python/requirements/requirements_tune.txt b/python/requirements/requirements_tune.in similarity index 91% rename from python/requirements/requirements_tune.txt rename to python/requirements/requirements_tune.in index 5ee1b9026f9e..40ccf4be43d1 100644 --- a/python/requirements/requirements_tune.txt +++ b/python/requirements/requirements_tune.in @@ -1,3 +1,6 @@ +# Use base requirements to constrain these requirements. 
+-c ../requirements.txt + ax-platform==0.1.9; python_version < '3.7' ax-platform==0.1.19; python_version >= '3.7' bayesian-optimization==1.2.0 @@ -17,7 +20,7 @@ matplotlib==3.3.3 mlflow==1.13.1 mxnet==1.7.0.post1 nevergrad==0.4.2.post5 -optuna==2.4.0 +optuna==2.3.0 pytest-remotedata==0.3.2 pytorch-lightning-bolts==0.2.5 pytorch-lightning==1.0.3 diff --git a/python/requirements_ml_docker.txt b/python/requirements_ml_docker.txt index bbecb5bd873e..6f610c46862e 100644 --- a/python/requirements_ml_docker.txt +++ b/python/requirements_ml_docker.txt @@ -1,7 +1,3 @@ ipython -tensorflow-gpu>=2.4.0 --f https://download.pytorch.org/whl/torch_stable.html -torch==1.7.1+cu110 --f https://download.pytorch.org/whl/torch_stable.html -torchvision==0.8.2+cu110 -pip; python_version > "3.7" +tensorflow-gpu +torch \ No newline at end of file diff --git a/python/requirements_rllib.txt b/python/requirements_rllib.txt index 5f5a0f99112d..94ae9cdbb338 100644 --- a/python/requirements_rllib.txt +++ b/python/requirements_rllib.txt @@ -13,10 +13,3 @@ pettingzoo>=1.4.0 # For tests on RecSim and Kaggle envs. recsim kaggle_environments - -# For MAML on PyTorch. -higher - -# Unity3D testing -mlagents -mlagents_envs diff --git a/python/setup.py b/python/setup.py index 76e540ada294..18d012b99e52 100644 --- a/python/setup.py +++ b/python/setup.py @@ -92,7 +92,7 @@ ] # If you're adding dependencies for ray extras, please -# also update the matching section of requirements/requirements.txt +# also update the matching section of requirements.txt # in this directory extras = { "serve": [ @@ -120,7 +120,7 @@ # These are the main dependencies for users of ray. This list # should be carefully curated. If you change it, please reflect -# the change in the matching section of requirements/requirements.txt +# the change in the matching section of requirements.txt install_requires = [ # TODO(alex) Pin the version once this PR is # included in the stable release. 
@@ -129,7 +129,6 @@ "aiohttp_cors", "aioredis", "click >= 7.0", - "cloudpickle", "colorama", "colorful", "filelock", @@ -450,7 +449,7 @@ def has_ext_modules(self): "ray=ray.scripts.scripts:main", "rllib=ray.rllib.scripts:cli [rllib]", "tune=ray.tune.scripts:cli", - "ray-operator=ray.ray_operator.operator:main", + "ray-operator=ray.operator.operator:main", "serve=ray.serve.scripts:cli", ] }, diff --git a/release/RELEASE_CHECKLIST.md b/release/RELEASE_CHECKLIST.md index f529b38ec52a..50b30f8ff54c 100644 --- a/release/RELEASE_CHECKLIST.md +++ b/release/RELEASE_CHECKLIST.md @@ -56,31 +56,6 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] Results added to `release/release_logs` - [ ] stress_tests - [ ] unit_gpu_tests -- [ ] Scalability Envelope Tests -- [ ] ASAN Test -- [ ] K8s Test - - [ ] K8s cluster launcher test - - [ ] K8s operator test -- [ ] Data processing tests - - [ ] streaming_shuffle -- [ ] Tune tests - - [ ] test_bookkeeping_overhead - - [x] test_result_throughput_cluster (ignore final time) - - [x] test_result_throughput_single_node (ignore final time) - - [x] test_network_overhead (ignore final time) - - [ ] test_long_running_large_checkpoints - - [ ] test_xgboost_sweep - - [ ] test_durable_trainable -- [ ] XGBoost Tests - - [ ] distributed_api_test - - [ ] train_small - - [ ] train_moderate - - [ ] train_gpu - - [ ] tune_small - - [ ] tune_4x32 - - [ ] tune_32x4 - - [ ] ft_small_non_elastic (flaky!) - - [ ] ft_small_elastic (flaky!) 
## Final Steps - [ ] Wheels uploaded to Test PyPI @@ -108,4 +83,4 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] PR to bump master version is merged - [ ] Release is announced internally - [ ] Release is announced externally -- [ ] Any code/doc changes made during the release process contributed back to master branch +- [ ] Any code/doc changes made during the release process contributed back to master branch \ No newline at end of file diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index f7eb6292fb49..287ba870c661 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -134,60 +134,10 @@ is generally the easiest way to run release tests. The summaries printed by each test should be checked in under ``release_logs/`` on the **master** branch (make a pull request). -5. **Scalability envelope tests** - - - Run the tests in `benchmarks/` (with `ray submit --start cluster.yaml `) - - Record the outputted times. - - Whether the results are acceptable is a judgement call. - -6. **ASAN tests** - - Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the whole Python tests to detect memory leaks. - -7. **K8s operator tests** - - Run the ``python/ray/tests/test_k8s_*`` to make sure K8s cluster launcher and operator works. Make sure the docker image is the released version. - -8. **Data processing tests** - - .. code-block:: bash - - data_processing_tests/README.rst - - Follow the instructions to kick off the tests and check the status of the workloads. - Data processing tests make sure all the data processing features are reliable and performant. - The following tests should be run. - - - ``data_processing_tests/workloads/streaming_shuffle.py`` run the 100GB streaming shuffle in a single node & fake 4 nodes cluster. - - **IMPORTANT** Check if the workload scripts has terminated. 
If so, please record the result (both read/write bandwidth and the shuffle result) to the ``release_logs/data_processing_tests/[test_name]``. - Both shuffling runtime and read/write bandwidth shouldn't be decreasing more than 15% compared to the previous release. - -9. **Ray Tune release tests** - - General Ray Tune functionality is implicitly tested via RLLib and XGBoost release tests. - We are in the process of introducing scalability envelopes for Ray Tune. - - Of the seven existing tests, three are currently not reaching their target time. - These three tests (test_result_throughput_cluster, test_result_throughput_single_node, and - test_network_overhead) are marked in the release checklist and don't have to be run at this time. - - The other release tests are expected to run through without errors and to pass within a pre-specified time. - The time is checked in the test function and the output will let you know if a run was fast enough and - thus passed the test. - -10. **XGBoost release tests** - - .. code-block:: bash - - xgboost_tests/README.rst - - Follow the instructions to kick off the tests and check the status of the workloads. - The XGBoost release tests use assertions or fail with exceptions and thus - should automatically tell you if they failed or not. - Only in the case of the fault tolerance tests you might want - to check the logs. See the readme for more information. +5. **ASAN tests** + Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the + whole Python tests to detect memory leaks. Identify and Resolve Release Blockers ------------------------------------- @@ -316,11 +266,10 @@ to proceed with the final stages of the release! of the docs, trigger a new build of the "latest" branch in readthedocs to see if that fixes it. -7. **Update latest Docker Image:** SET THE VERSION NUMBER IN `docker/fix-docker-latest.sh`, then run the script ot update the "latest" tag +7. 
**Update latest Docker Image:** Message Ian Rodney to bump the "latest" tag in Dockerhub for the - ``rayproject/ray`` and ``rayproject/ray-ml`` Docker images to point to the Docker images built from the release. (Make sure there is no permission denied error, you will likely have to ask Thomas for permissions). - - Check the dockerhub to verify the update worked. https://hub.docker.com/repository/docker/rayproject/ray/tags?page=1&name=latest&ordering=last_updated + ``rayproject/ray`` and ``rayproject/ray-ml`` Docker images to point to the Docker images built from the release. (If you have privileges in these + docker projects, you can do this step yourself.) 8. **Send out an email announcing the release** to the engineering@anyscale.com Google group, and post a slack message in the Announcements channel of the diff --git a/release/data_processing_tests/README.rst b/release/data_processing_tests/README.rst deleted file mode 100644 index 3db8eeb9ce67..000000000000 --- a/release/data_processing_tests/README.rst +++ /dev/null @@ -1,9 +0,0 @@ -Running script --------------- - -Run `unset RAY_ADDRESS; python workloads/streaming_shuffle.py` - -Cluster configurations ----------------------- - -Make sure the test runs in i3.8xl (IO optimized instance). \ No newline at end of file diff --git a/release/data_processing_tests/cluster.yaml b/release/data_processing_tests/cluster.yaml deleted file mode 100644 index 903dd2564def..000000000000 --- a/release/data_processing_tests/cluster.yaml +++ /dev/null @@ -1,128 +0,0 @@ -# An unique identifier for the head node and workers of this cluster. -cluster_name: native-shuffle-tests - -# The minimum number of workers nodes to launch in addition to the head -# node. This number should be >= 0. -min_workers: 0 - -# The maximum number of workers nodes to launch in addition to the head -# node. This takes precedence over min_workers. -max_workers: 0 - -# The autoscaler will scale up the cluster faster with higher upscaling speed. 
-# E.g., if the task requires adding more nodes then autoscaler will gradually -# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. -# This number should be > 0. -upscaling_speed: 1.0 - -# This executes all commands on all nodes in the docker container, -# and opens all the necessary ports to support the Ray cluster. -# Empty string means disabled. -docker: - image: "" # You can change this to latest-cpu if you don't need GPU support and want a faster startup - # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull - container_name: "" - # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image - # if no cached version is present. - pull_before_run: True - run_options: [] # Extra options to pass into "docker run" - - # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray-ml:latest-gpu" - # Allow Ray to automatically detect GPUs - - # worker_image: "rayproject/ray-ml:latest-cpu" - # worker_run_options: [] - -# If a node is idle for this many minutes, it will be removed. -idle_timeout_minutes: 5 - -# Cloud-provider specific configuration. -provider: - type: aws - region: us-west-2 - # Availability zone(s), comma-separated, that nodes may be launched in. - # Nodes are currently spread between zones by a round-robin approach, - # however this implementation detail should not be relied upon. - availability_zone: us-west-2a,us-west-2b - # Whether to allow node reuse. If set to False, nodes will be terminated - # instead of stopped. - cache_stopped_nodes: True # If not present, the default is True. - -# How Ray will authenticate with newly launched nodes. -auth: - ssh_user: ubuntu -# By default Ray creates a new private keypair, but you can also use your own. -# If you do so, make sure to also set "KeyName" in the head and worker node -# configurations below. 
-# ssh_private_key: /path/to/your/key.pem - -# Provider-specific config for the head node, e.g. instance type. By default -# Ray will auto-configure unspecified fields such as SubnetId and KeyName. -# For more documentation on available fields, see: -# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances -head_node: - InstanceType: i3.8xlarge - ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 - - # You can provision additional disk space with a conf as follows - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 1000 - - # Additional options in the boto docs. - -# Provider-specific config for worker nodes, e.g. instance type. By default -# Ray will auto-configure unspecified fields such as SubnetId and KeyName. -# For more documentation on available fields, see: -# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances -worker_nodes: - InstanceType: i3.8xlarge - ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 - - # You can provision additional disk space with a conf as follows - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 1000 - -# Patterns for files to exclude when running rsync up or rsync down -rsync_exclude: - - "**/.git" - - "**/.git/**" - -# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for -# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided -# as a value, the behavior will match git's behavior for finding and using .gitignore files. -rsync_filter: - - ".gitignore" - -# List of commands that will be run before `setup_commands`. If docker is -# enabled, these commands will run outside the container and before docker -# is setup. -initialization_commands: [] - -# List of shell commands to run to set up nodes. 
-setup_commands: - - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl - # Not necessary. - - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;' - - pip install tqdm - -# Custom commands that will be run on the head node after common setup. -head_setup_commands: [] - -# Custom commands that will be run on worker nodes after common setup. -worker_setup_commands: [] - -# Command to start ray on the head node. You don't need to change this. -head_start_ray_commands: - - ray stop - # - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --system-config='{"automatic_object_spilling_enabled":true,"max_io_workers":1,"object_spilling_config":"{\"type\":\"filesystem\",\"params\":{\"directory_path\":\"/tmp/spill\"}}"}' - -# Command to start ray on worker nodes. You don't need to change this. -worker_start_ray_commands: - - ray stop - # - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/release/data_processing_tests/workloads/streaming_shuffle.py b/release/data_processing_tests/workloads/streaming_shuffle.py deleted file mode 100644 index 903042bb9956..000000000000 --- a/release/data_processing_tests/workloads/streaming_shuffle.py +++ /dev/null @@ -1,177 +0,0 @@ -import time -import json -import ray -import numpy as np -from typing import List -from tqdm import tqdm - -from ray.cluster_utils import Cluster - -num_nodes = 4 -num_cpus = 4 -partition_size = int(500e6) # 500MB -# Number of map & reduce tasks == num_partitions. -# Number of objects == num_partitions ^ 2. -num_partitions = 200 -# There are two int64 per row, so we divide by 8 * 2 bytes. 
-rows_per_partition = partition_size // (8 * 2) -object_store_size = 20 * 1024 * 1024 * 1024 # 20G - -system_config = { - "automatic_object_spilling_enabled": True, - "max_io_workers": 1, - "object_spilling_config": json.dumps( - { - "type": "filesystem", - "params": { - "directory_path": "/tmp/spill" - } - }, - separators=(",", ":")) -} - - -def display_spilling_info(address): - state = ray.state.GlobalState() - state._initialize_global_state(address, - ray.ray_constants.REDIS_DEFAULT_PASSWORD) - raylet = state.node_table()[0] - memory_summary = ray.internal.internal_api.memory_summary( - raylet["NodeManagerAddress"], raylet["NodeManagerPort"]) - for line in memory_summary.split("\n"): - if "Spilled" in line: - print(line) - if "Restored" in line: - print(line) - print("\n\n") - - -@ray.remote -class Counter: - def __init__(self): - self.num_map = 0 - self.num_reduce = 0 - - def inc(self): - self.num_map += 1 - # print("Num map tasks finished", self.num_map) - - def inc2(self): - self.num_reduce += 1 - # print("Num reduce tasks finished", self.num_reduce) - - def finish(self): - pass - - -# object store peak memory: O(partition size / num partitions) -# heap memory: O(partition size / num partitions) -@ray.remote(num_returns=num_partitions) -def shuffle_map_streaming( - i, counter_handle=None) -> List["ObjectRef[np.ndarray]"]: - outputs = [ - ray.put( - np.ones((rows_per_partition // num_partitions, 2), dtype=np.int64)) - for _ in range(num_partitions) - ] - counter_handle.inc.remote() - return outputs - - -# object store peak memory: O(partition size / num partitions) -# heap memory: O(partition size) -- TODO can be reduced too -@ray.remote -def shuffle_reduce_streaming(*inputs, counter_handle=None) -> np.ndarray: - out = None - for chunk in inputs: - if out is None: - out = ray.get(chunk) - else: - out = np.concatenate([out, ray.get(chunk)]) - counter_handle.inc2.remote() - return out - - -shuffle_map = shuffle_map_streaming -shuffle_reduce = 
shuffle_reduce_streaming - - -def run_shuffle(): - counter = Counter.remote() - start = time.time() - print("start map") - shuffle_map_out = [ - shuffle_map.remote(i, counter_handle=counter) - for i in range(num_partitions) - ] - # wait until all map is done before reduce phase. - for out in tqdm(shuffle_map_out): - ray.get(out) - - # Start reducing - shuffle_reduce_out = [ - shuffle_reduce.remote( - *[shuffle_map_out[i][j] for i in range(num_partitions)], - counter_handle=counter) for j in range(num_partitions) - ] - - print("start shuffle.") - pbar = tqdm(total=num_partitions) - total_rows = 0 - ready, unready = ray.wait(shuffle_reduce_out) - while unready: - ready, unready = ray.wait(unready) - for output in ready: - pbar.update(1) - total_rows += ray.get(output).shape[0] - delta = time.time() - start - - ray.get(counter.finish.remote()) - print("Shuffled", total_rows * 8 * 2, "bytes in", delta, - "seconds in a single node.\n") - - -def run_single_node(): - address = ray.init( - num_cpus=num_cpus * num_nodes, - object_store_memory=object_store_size, - _system_config=system_config) - - # Run shuffle. - print( - "\n\nTest streaming shuffle with a single node.\n" - f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}" - "GB") - run_shuffle() - time.sleep(5) - display_spilling_info(address["redis_address"]) - ray.shutdown() - time.sleep(5) - - -def run_multi_nodes(): - c = Cluster() - c.add_node( - num_cpus=4, - object_store_memory=object_store_size, - _system_config=system_config) - ray.init(address=c.address) - for _ in range(num_nodes - 1): # subtract a head node. - c.add_node(num_cpus=4, object_store_memory=object_store_size) - c.wait_for_nodes() - - # Run shuffle. 
- print( - f"\n\nTest streaming shuffle with {num_nodes} nodes.\n" - f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}" - "GB") - run_shuffle() - time.sleep(5) - display_spilling_info(c.address) - ray.shutdown() - c.shutdown() - time.sleep(5) - - -run_single_node() -run_multi_nodes() diff --git a/release/horovod_tests/cluster.yaml b/release/horovod_tests/cluster.yaml index 5dbc457a78c7..880ebdba2423 100644 --- a/release/horovod_tests/cluster.yaml +++ b/release/horovod_tests/cluster.yaml @@ -10,6 +10,8 @@ min_workers: 3 # node. This takes precedence over min_workers. min_workers defaults to 0. max_workers: 3 +target_utilization_fraction: 0.8 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/long_running_distributed_tests/cluster.yaml b/release/long_running_distributed_tests/cluster.yaml index 4710a47fcc4a..f8d10549a24c 100644 --- a/release/long_running_distributed_tests/cluster.yaml +++ b/release/long_running_distributed_tests/cluster.yaml @@ -3,6 +3,7 @@ cluster_name: long-running-distributed-tests min_workers: 3 max_workers: 3 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py deleted file mode 120000 index 4bc3925a1e83..000000000000 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ /dev/null @@ -1 +0,0 @@ -../../../python/ray/util/sgd/torch/examples/pytorch_pbt_failure.py \ No newline at end of file diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py new file mode 100644 index 000000000000..2451fe4a2228 --- /dev/null +++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -0,0 +1,138 @@ +import argparse +import numpy as np +import os +import torch 
+import torch.nn as nn +from torch.utils.data import DataLoader, Subset +from torchvision.datasets import CIFAR10 +import torchvision.transforms as transforms + +import ray +from ray import tune +from ray.tune import CLIReporter +from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.utils.util import merge_dicts +from ray.tune.utils.mock import FailureInjectorCallback +from ray.util.sgd.torch import TorchTrainer, TrainingOperator +from ray.util.sgd.torch.resnet import ResNet18 +from ray.util.sgd.utils import BATCH_SIZE + +parser = argparse.ArgumentParser() +parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for training.") +args = parser.parse_args() + + +def initialization_hook(): + # Need this for avoiding a connection restart issue on AWS. + os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" + os.environ["NCCL_LL_THRESHOLD"] = "0" + + # set the below if needed + # print("NCCL DEBUG SET") + # os.environ["NCCL_DEBUG"] = "INFO" + + +def cifar_creator(config): + transform_train = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), + (0.2023, 0.1994, 0.2010)), + ]) # meanstd transformation + + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), + (0.2023, 0.1994, 0.2010)), + ]) + train_dataset = CIFAR10( + root="~/data", train=True, download=True, transform=transform_train) + validation_dataset = CIFAR10( + root="~/data", train=False, download=False, transform=transform_test) + + if config.get("test_mode"): + train_dataset = Subset(train_dataset, list(range(64))) + validation_dataset = Subset(validation_dataset, list(range(64))) + + train_loader = DataLoader( + train_dataset, batch_size=config[BATCH_SIZE], num_workers=2) + validation_loader = DataLoader( + validation_dataset, batch_size=config[BATCH_SIZE], num_workers=2) + 
return train_loader, validation_loader + + +def optimizer_creator(model, config): + """Returns optimizer""" + return torch.optim.SGD( + model.parameters(), + lr=config.get("lr", 0.1), + momentum=config.get("momentum", 0.9)) + + +ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) +num_training_workers = 1 if args.smoke_test else 3 + +CustomTrainingOperator = TrainingOperator.from_creators( + model_creator=ResNet18, + optimizer_creator=optimizer_creator, + data_creator=cifar_creator, + loss_creator=nn.CrossEntropyLoss) + +TorchTrainable = TorchTrainer.as_trainable( + training_operator_cls=CustomTrainingOperator, + initialization_hook=initialization_hook, + num_workers=num_training_workers, + config={ + "test_mode": args.smoke_test, + BATCH_SIZE: 128 * num_training_workers, + }, + use_gpu=not args.smoke_test) + + +class NoFaultToleranceTrainable(TorchTrainable): + def _train(self): + train_stats = self.trainer.train(max_retries=0, profile=True) + validation_stats = self.trainer.validate(profile=True) + stats = merge_dicts(train_stats, validation_stats) + return stats + + +pbt_scheduler = PopulationBasedTraining( + time_attr="training_iteration", + metric="val_loss", + mode="min", + perturbation_interval=1, + hyperparam_mutations={ + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + }) + +reporter = CLIReporter() +reporter.add_metric_column("val_loss", "loss") +reporter.add_metric_column("val_accuracy", "acc") + +analysis = tune.run( + NoFaultToleranceTrainable, + num_samples=4, + config={ + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "head_location": None, + "worker_locations": None + }, + max_failures=-1, # used for fault tolerance + checkpoint_freq=2, # used for fault tolerance + progress_reporter=reporter, + scheduler=pbt_scheduler, + callbacks=[FailureInjectorCallback()], + queue_trials=True, + 
stop={"training_iteration": 1} if args.smoke_test else None) + +print(analysis.get_best_config(metric="val_loss", mode="min")) diff --git a/release/release_logs/1.2.0/microbenchmark.txt b/release/release_logs/1.2.0/microbenchmark.txt deleted file mode 100644 index 064e8b4411d4..000000000000 --- a/release/release_logs/1.2.0/microbenchmark.txt +++ /dev/null @@ -1,28 +0,0 @@ -single client get calls per second 48106.48 +- 847.52 -single client put calls per second 42709.1 +- 84.85 -multi client put calls per second 172608.71 +- 3071.81 -single client get calls (Plasma Store) per second 10669.26 +- 286.63 -single client put calls (Plasma Store) per second 6622.51 +- 47.03 -multi client put calls (Plasma Store) per second 9804.51 +- 462.32 -single client put gigabytes per second 11.45 +- 10.79 -multi client put gigabytes per second 35.06 +- 0.26 -single client tasks sync per second 1899.11 +- 87.63 -single client tasks async per second 18599.58 +- 124.02 -multi client tasks async per second 50388.88 +- 2585.47 -1:1 actor calls sync per second 3053.21 +- 60.37 -1:1 actor calls async per second 7768.59 +- 268.78 -1:1 actor calls concurrent per second 7106.24 +- 219.87 -1:n actor calls async per second 17132.11 +- 881.8 -n:n actor calls async per second 51037.11 +- 1732.95 -n:n actor calls with arg async per second 13746.19 +- 171.94 -1:1 async-actor calls sync per second 2103.39 +- 52.51 -1:1 async-actor calls async per second 4100.13 +- 53.6 -1:1 async-actor calls with args async per second 3085.78 +- 165.8 -1:n async-actor calls async per second 13906.28 +- 363.9 -n:n async-actor calls async per second 40269.65 +- 1113.55 -client: get calls per second 2414.77 +- 43.07 -client: put calls per second 1346.13 +- 8.2 -client: remote put calls per second 58855.54 +- 849.21 -client: 1:1 actor calls sync per second 730.58 +- 11.66 -client: 1:1 actor calls async per second 774.79 +- 14.1 -client: 1:1 actor calls concurrent per second 805.73 +- 11.46 \ No newline at end of file 
diff --git a/release/release_logs/1.2.0/notes.txt b/release/release_logs/1.2.0/notes.txt deleted file mode 100644 index 91c693f445a4..000000000000 --- a/release/release_logs/1.2.0/notes.txt +++ /dev/null @@ -1,3 +0,0 @@ -The test.pypi.org wheel does not match the release wheel because there was #14062 was discovered during the sanity check. - -Wheels were re-sanity checked by pip installing from s3. diff --git a/release/release_logs/1.2.0/rllib_regression_tf.txt b/release/release_logs/1.2.0/rllib_regression_tf.txt deleted file mode 100644 index 8760b66ffb64..000000000000 --- a/release/release_logs/1.2.0/rllib_regression_tf.txt +++ /dev/null @@ -1,27 +0,0 @@ -== Status == -Memory usage on this node: 8.8/480.3 GiB -Using FIFO scheduling algorithm. -Resources requested: 0/64 CPUs, 0.0/8 GPUs, 0.0/325.83 GiB heap, 0.0/99.07 GiB objects (0/1.0 accelerator_type:V100) -Result logdir: /home/ray/ray_results/a2c-tf-atari -Result logdir: /home/ray/ray_results/apex-dqn-tf-atari -Result logdir: /home/ray/ray_results/dqn-tf-atari -Result logdir: /home/ray/ray_results/impala-tf-atari -Result logdir: /home/ray/ray_results/ppo-tf-atari -Result logdir: /home/ray/ray_results/sac-tf-halfcheetah-pybullet -Number of trials: 12/12 (12 TERMINATED) -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ -| Trial name | status | loc | iter | total time (s) | ts | reward | episode_reward_max | episode_reward_min | episode_len_mean | -|-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------| -| A2C_BreakoutNoFrameskip-v4_e6509_00000 | TERMINATED | | 355 | 3604.01 | 4137500 | 1.86 | 10 | 0 | 815.78 | -| A2C_BreakoutNoFrameskip-v4_e6509_00001 | TERMINATED | | 354 | 3601.32 | 4067500 | 1.79 | 10 | 0 | 803.07 | -| 
APEX_BreakoutNoFrameskip-v4_e6509_00002 | TERMINATED | | 98 | 3626.91 | 7297440 | 1.4 | 9 | 0 | 739.886 | -| APEX_BreakoutNoFrameskip-v4_e6509_00003 | TERMINATED | | 97 | 3607.18 | 7222240 | 1.17816 | 5 | 0 | 702.362 | -| DQN_BreakoutNoFrameskip-v4_e6509_00004 | TERMINATED | | 35 | 3636.53 | 360000 | 1.25 | 6 | 0 | 710.49 | -| DQN_BreakoutNoFrameskip-v4_e6509_00005 | TERMINATED | | 35 | 3631.05 | 360000 | 1.36 | 9 | 0 | 723.54 | -| IMPALA_BreakoutNoFrameskip-v4_e6509_00006 | TERMINATED | | 350 | 3607.49 | 3024500 | 1.87 | 9 | 0 | 816.3 | -| IMPALA_BreakoutNoFrameskip-v4_e6509_00007 | TERMINATED | | 349 | 3601.95 | 3025500 | 1.21 | 6 | 0 | 716.7 | -| PPO_BreakoutNoFrameskip-v4_e6509_00008 | TERMINATED | | 1858 | 3600.41 | 9290000 | 1.69 | 10 | 0 | 792.13 | -| PPO_BreakoutNoFrameskip-v4_e6509_00009 | TERMINATED | | 1851 | 3601.2 | 9255000 | 1.6 | 11 | 0 | 770.95 | -| SAC_HalfCheetahBulletEnv-v0_e6509_00010 | TERMINATED | | 45 | 3670.33 | 54000 | 269.06 | 622.238 | -454.818 | 1000 | -| SAC_HalfCheetahBulletEnv-v0_e6509_00011 | TERMINATED | | 45 | 3654.38 | 54000 | 473.166 | 628.875 | 156.264 | 1000 | -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ diff --git a/release/release_logs/1.2.0/rllib_regression_torch.txt b/release/release_logs/1.2.0/rllib_regression_torch.txt deleted file mode 100644 index 11309f5e3c68..000000000000 --- a/release/release_logs/1.2.0/rllib_regression_torch.txt +++ /dev/null @@ -1,27 +0,0 @@ -== Status == -Memory usage on this node: 8.6/480.3 GiB -Using FIFO scheduling algorithm. 
-Resources requested: 0/64 CPUs, 0.0/8 GPUs, 0.0/325.73 GiB heap, 0.0/99.07 GiB objects (0/1.0 accelerator_type:V100) -Result logdir: /home/ray/ray_results/a2c-torch-atari -Result logdir: /home/ray/ray_results/apex-dqn-torch-atari -Result logdir: /home/ray/ray_results/dqn-torch-atari -Result logdir: /home/ray/ray_results/impala-torch-atari -Result logdir: /home/ray/ray_results/ppo-torch-atari -Result logdir: /home/ray/ray_results/sac-torch-halfcheetah-pybullet -Number of trials: 12/12 (12 TERMINATED) -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ -| Trial name | status | loc | iter | total time (s) | ts | reward | episode_reward_max | episode_reward_min | episode_len_mean | -|-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------| -| A2C_BreakoutNoFrameskip-v4_a6f57_00000 | TERMINATED | | 353 | 3603.76 | 3378500 | 1.93 | 15 | 0 | 821.58 | -| A2C_BreakoutNoFrameskip-v4_a6f57_00001 | TERMINATED | | 353 | 3608.48 | 3404500 | 1.15 | 6 | 0 | 701.51 | -| APEX_BreakoutNoFrameskip-v4_a6f57_00002 | TERMINATED | | 113 | 3615.57 | 5680160 | 1.6381 | 9 | 0 | 773.381 | -| APEX_BreakoutNoFrameskip-v4_a6f57_00003 | TERMINATED | | 114 | 3636.38 | 5764800 | 1.39655 | 6 | 0 | 735.914 | -| DQN_BreakoutNoFrameskip-v4_a6f57_00004 | TERMINATED | | 27 | 3684.72 | 280000 | 1.79 | 12 | 0 | 743.6 | -| DQN_BreakoutNoFrameskip-v4_a6f57_00005 | TERMINATED | | 27 | 3685.26 | 280000 | 1.14 | 5 | 0 | 699.19 | -| IMPALA_BreakoutNoFrameskip-v4_a6f57_00006 | TERMINATED | | 356 | 3606.67 | 7850250 | 1.7803 | 12 | 0 | 795.455 | -| IMPALA_BreakoutNoFrameskip-v4_a6f57_00007 | TERMINATED | | 355 | 3609.98 | 7903500 | 1.68217 | 8 | 0 | 796.659 | -| PPO_BreakoutNoFrameskip-v4_a6f57_00008 | TERMINATED | | 1401 | 3601.51 | 7005000 | 2.61 | 10 | 0 | 
897.83 | -| PPO_BreakoutNoFrameskip-v4_a6f57_00009 | TERMINATED | | 1406 | 3600.35 | 7030000 | 1.47 | 11 | 0 | 647.8 | -| SAC_HalfCheetahBulletEnv-v0_a6f57_00010 | TERMINATED | | 37 | 3686.44 | 46000 | 641.43 | 723.144 | 504.62 | 1000 | -| SAC_HalfCheetahBulletEnv-v0_a6f57_00011 | TERMINATED | | 37 | 3645.16 | 46000 | 631.65 | 664.021 | 599.864 | 1000 | -+-------------------------------------------+------------+-------+--------+------------------+---------+-----------+----------------------+----------------------+--------------------+ diff --git a/release/release_logs/1.2.0/scalability/distributed.txt b/release/release_logs/1.2.0/scalability/distributed.txt deleted file mode 100644 index 860875201cea..000000000000 --- a/release/release_logs/1.2.0/scalability/distributed.txt +++ /dev/null @@ -1,4 +0,0 @@ -Actor time: 34.21903751100001 (10000 actors) │ -Task time: 386.82114117900005 (10000 tasks) │ -PG time: 31.368525181999985 (1000 placement groups) │ -Node launch time: 756.3447095859999 (250 nodes) \ No newline at end of file diff --git a/release/release_logs/1.2.0/scalability/object_store.txt b/release/release_logs/1.2.0/scalability/object_store.txt deleted file mode 100644 index 0471a93ba429..000000000000 --- a/release/release_logs/1.2.0/scalability/object_store.txt +++ /dev/null @@ -1 +0,0 @@ -Broadcast time: 135.75278311699998 (1073741824 B x 50 nodes) diff --git a/release/release_logs/1.2.0/scalability/single_node.txt b/release/release_logs/1.2.0/scalability/single_node.txt deleted file mode 100644 index 7a100e3eae98..000000000000 --- a/release/release_logs/1.2.0/scalability/single_node.txt +++ /dev/null @@ -1,5 +0,0 @@ -Many args time: 11.433474627000002 (10000 args) -Many returns time: 4.487700554 (3000 returns) -Ray.get time: 21.957432587999996 (10000 args) -Queued task time: 124.148238013 (1000000 tasks) -Ray.get large object time: 35.118229127000006 (107374182400 bytes) \ No newline at end of file diff --git 
a/release/release_logs/1.2.0/stress_tests/test_dead_actors.txt b/release/release_logs/1.2.0/stress_tests/test_dead_actors.txt deleted file mode 100644 index 2e73606f2328..000000000000 --- a/release/release_logs/1.2.0/stress_tests/test_dead_actors.txt +++ /dev/null @@ -1,4 +0,0 @@ -Finished in: 133.60612034797668s -Average iteration time: 1.3360581374168397s -Max iteration time: 5.137001276016235s -Min iteration time: 0.15551400184631348s diff --git a/release/release_logs/1.2.0/stress_tests/test_many_tasks.txt b/release/release_logs/1.2.0/stress_tests/test_many_tasks.txt deleted file mode 100644 index ffc9bc3cd483..000000000000 --- a/release/release_logs/1.2.0/stress_tests/test_many_tasks.txt +++ /dev/null @@ -1,17 +0,0 @@ -Stage 0 results: - Total time: 50.40076494216919 -Stage 1 results: - Total time: 191.78780102729797 - Average iteration time: 19.178766775131226 - Max iteration time: 21.238199949264526 - Min iteration time: 18.299438953399658 -Stage 2 results: - Total time: 280.4905333518982 - Average iteration time: 56.0978446483612 - Max iteration time: 56.96464133262634 - Min iteration time: 53.859785318374634 -Stage 3 results: - Actor creation time: 0.3304018974304199 - Total time: 2303.117142677307 -Stage 4 results: - Scheduling spread: 66.90121385927009. \ No newline at end of file diff --git a/release/release_logs/1.2.0/stress_tests/test_placement_group.txt b/release/release_logs/1.2.0/stress_tests/test_placement_group.txt deleted file mode 100644 index 62f8a7b74786..000000000000 --- a/release/release_logs/1.2.0/stress_tests/test_placement_group.txt +++ /dev/null @@ -1,3 +0,0 @@ -Avg placement group creating time: 0.2691924729741867 ms -Avg placement group removing time: 0.8786630945927776 ms -Stress Test succeed. 
\ No newline at end of file diff --git a/release/rllib_tests/stress_tests/cluster.yaml b/release/rllib_tests/stress_tests/cluster.yaml index 4c83e27c33aa..8f20a46afb85 100644 --- a/release/rllib_tests/stress_tests/cluster.yaml +++ b/release/rllib_tests/stress_tests/cluster.yaml @@ -3,6 +3,7 @@ cluster_name: ray-rllib-stress-tests min_workers: 9 max_workers: 9 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/rllib_tests/unit_gpu_tests/requirements.txt b/release/rllib_tests/unit_gpu_tests/requirements.txt index b8a991f74f34..4f88975397f9 100644 --- a/release/rllib_tests/unit_gpu_tests/requirements.txt +++ b/release/rllib_tests/unit_gpu_tests/requirements.txt @@ -1,9 +1,7 @@ ray[rllib] ray --f https://download.pytorch.org/whl/torch_stable.html -torch==1.7.1+cu110 --f https://download.pytorch.org/whl/torch_stable.html -torchvision==0.8.2+cu110 +torch==1.6+cu101 +torchvision==0.7.0+cu101 boto3==1.4.8 cython==0.29.0 pytest diff --git a/release/stress_tests/autoscaler-cluster.yaml b/release/stress_tests/autoscaler-cluster.yaml index 9c17d303e4db..ed5ee2bd58f1 100644 --- a/release/stress_tests/autoscaler-cluster.yaml +++ b/release/stress_tests/autoscaler-cluster.yaml @@ -13,6 +13,13 @@ min_workers: 100 # node. This takes precedence over min_workers. max_workers: 100 +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 5 diff --git a/release/stress_tests/cluster.yaml b/release/stress_tests/cluster.yaml index 155ae1329c0b..a513d9764c11 100644 --- a/release/stress_tests/cluster.yaml +++ b/release/stress_tests/cluster.yaml @@ -13,6 +13,13 @@ min_workers: 100 # node. This takes precedence over min_workers. max_workers: 100 +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/tune_tests/scalability_tests/cluster.yaml b/release/tune_tests/scalability_tests/cluster.yaml new file mode 100644 index 000000000000..e279efb37dab --- /dev/null +++ b/release/tune_tests/scalability_tests/cluster.yaml @@ -0,0 +1,33 @@ +cluster_name: ray-tune-scalability-tests + +min_workers: 15 +max_workers: 15 +initial_workers: 15 + +target_utilization_fraction: 0.8 +idle_timeout_minutes: 15 + +docker: + image: anyscale/ray:nightly + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +auth: + ssh_user: ubuntu + +head_node: + # 64 CPUs + InstanceType: m5.16xlarge + +worker_nodes: + # 64 CPUs + InstanceType: m5.16xlarge + +setup_commands: + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl diff --git a/release/tune_tests/scalability_tests/cluster_16x2.yaml b/release/tune_tests/scalability_tests/cluster_16x2.yaml deleted file mode 100644 index e5e56e7c957d..000000000000 --- a/release/tune_tests/scalability_tests/cluster_16x2.yaml +++ /dev/null @@ -1,47 +0,0 @@ -cluster_name: 
ray-tune-scalability-tests-16x2 - -max_workers: 15 -upscaling_speed: 15 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_2_ondemand: - node_config: - InstanceType: m5.large - resources: {"CPU": 2} - min_workers: 0 - max_workers: 0 - cpu_2_spot: - node_config: - InstanceType: m5.large - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 2} - min_workers: 15 - max_workers: 15 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_2_ondemand -worker_default_node_type: cpu_2_spot - -setup_commands: - - ray install-nightly - - pip install -U awscli - -file_mounts: { - "~/release-automation-tune_scalability_tests": "." -} diff --git a/release/tune_tests/scalability_tests/cluster_16x64.yaml b/release/tune_tests/scalability_tests/cluster_16x64.yaml deleted file mode 100644 index fbe954b6c789..000000000000 --- a/release/tune_tests/scalability_tests/cluster_16x64.yaml +++ /dev/null @@ -1,42 +0,0 @@ -cluster_name: ray-tune-scalability-tests-16x64 - -max_workers: 15 -upscaling_speed: 15 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_64_ondemand: - node_config: - InstanceType: m5.16xlarge - resources: {"CPU": 64} - min_workers: 0 - max_workers: 0 - cpu_64_spot: - node_config: - InstanceType: m5.16xlarge - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 64} - min_workers: 15 - max_workers: 15 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_64_ondemand -worker_default_node_type: cpu_64_spot - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_16x64_data.yaml 
b/release/tune_tests/scalability_tests/cluster_16x64_data.yaml deleted file mode 100644 index 56db5a349065..000000000000 --- a/release/tune_tests/scalability_tests/cluster_16x64_data.yaml +++ /dev/null @@ -1,53 +0,0 @@ -cluster_name: ray-tune-scalability-tests-16x64_data - -max_workers: 16 -upscaling_speed: 16 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_64_ondemand: - node_config: - InstanceType: m5.16xlarge - resources: {"CPU": 64} - min_workers: 0 - max_workers: 0 - cpu_64_spot: - node_config: - InstanceType: m5.16xlarge - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 64} - min_workers: 15 - max_workers: 15 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_64_ondemand -worker_default_node_type: cpu_64_spot - -file_mounts: { - "~/release-automation-tune_scalability_tests": "." 
-} - -setup_commands: - - ray install-nightly - - pip install pytest xgboost_ray - - mkdir -p ~/data || true - - rm -rf ~/data/train.parquet || true - - rm -rf ~/data/test.parquet || true - - cp -R /tmp/ray_tmp_mount/release-automation-tune_scalability_tests ~/release-automation-tune_scalability_tests || echo "Copy failed" - - python ~/release-automation-tune_scalability_tests/create_test_data.py ~/data/train.parquet --seed 1234 --num-rows 40000000 --num-cols 40 --num-partitions 128 --num-classes 2 - - python ~/release-automation-tune_scalability_tests/create_test_data.py ~/data/test.parquet --seed 1234 --num-rows 10000000 --num-cols 40 --num-partitions 128 --num-classes 2 diff --git a/release/tune_tests/scalability_tests/cluster_1x16.yaml b/release/tune_tests/scalability_tests/cluster_1x16.yaml deleted file mode 100644 index a40e0d0a0711..000000000000 --- a/release/tune_tests/scalability_tests/cluster_1x16.yaml +++ /dev/null @@ -1,34 +0,0 @@ -cluster_name: ray-tune-scalability-tests-1x16 - -max_workers: 0 -upscaling_speed: 1 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_4_ondemand: - node_config: - InstanceType: m5.xlarge - resources: {"CPU": 4} - min_workers: 0 - max_workers: 0 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_4_ondemand -worker_default_node_type: cpu_4_ondemand - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_1x32_hd.yaml b/release/tune_tests/scalability_tests/cluster_1x32_hd.yaml deleted file mode 100644 index e909c138c90b..000000000000 --- a/release/tune_tests/scalability_tests/cluster_1x32_hd.yaml +++ /dev/null @@ -1,40 +0,0 @@ -cluster_name: ray-tune-scalability-tests-1x32_hd - -max_workers: 0 -upscaling_speed: 1 - -idle_timeout_minutes: 0 - -docker: - image: 
anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_32_hd_ondemand: - node_config: - InstanceType: m5.8xlarge - - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 160 - - resources: {"CPU": 32} # 128 GB memory - min_workers: 0 - max_workers: 0 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_32_hd_ondemand -worker_default_node_type: cpu_32_hd_ondemand - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_1x96.yaml b/release/tune_tests/scalability_tests/cluster_1x96.yaml deleted file mode 100644 index ec01ede17926..000000000000 --- a/release/tune_tests/scalability_tests/cluster_1x96.yaml +++ /dev/null @@ -1,34 +0,0 @@ -cluster_name: ray-tune-scalability-tests-1x96 - -max_workers: 0 -upscaling_speed: 1 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_96_ondemand: - node_config: - InstanceType: m5.24xlarge - resources: {"CPU": 96} - min_workers: 0 - max_workers: 0 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_96_ondemand -worker_default_node_type: cpu_96_ondemand - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/cluster_200x2.yaml b/release/tune_tests/scalability_tests/cluster_200x2.yaml deleted file mode 100644 index 143505ab2d14..000000000000 --- a/release/tune_tests/scalability_tests/cluster_200x2.yaml +++ /dev/null @@ -1,42 +0,0 @@ -cluster_name: ray-tune-scalability-tests-200x2 - -max_workers: 199 -upscaling_speed: 199 - -idle_timeout_minutes: 0 - -docker: - image: anyscale/ray:nightly - container_name: ray_container - pull_before_run: true - -provider: - type: aws - region: 
us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: false - -available_node_types: - cpu_2_ondemand: - node_config: - InstanceType: m5.large - resources: {"CPU": 2} - min_workers: 0 - max_workers: 0 - cpu_2_spot: - node_config: - InstanceType: m5.large - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 2} - min_workers: 199 - max_workers: 199 - -auth: - ssh_user: ubuntu - -head_node_type: cpu_2_ondemand -worker_default_node_type: cpu_2_spot - -setup_commands: - - ray install-nightly diff --git a/release/tune_tests/scalability_tests/create_test_data.py b/release/tune_tests/scalability_tests/create_test_data.py deleted file mode 100644 index f7a450105426..000000000000 --- a/release/tune_tests/scalability_tests/create_test_data.py +++ /dev/null @@ -1,61 +0,0 @@ -import argparse -import numpy as np -import os - -from xgboost_ray.tests.utils import create_parquet - -if __name__ == "__main__": - if "OMP_NUM_THREADS" in os.environ: - del os.environ["OMP_NUM_THREADS"] - - parser = argparse.ArgumentParser(description="Create fake data.") - parser.add_argument( - "filename", type=str, default="/data/parted.parquet/", help="ray/dask") - parser.add_argument( - "-r", - "--num-rows", - required=False, - type=int, - default=1e8, - help="num rows") - parser.add_argument( - "-p", - "--num-partitions", - required=False, - type=int, - default=100, - help="num partitions") - parser.add_argument( - "-c", - "--num-cols", - required=False, - type=int, - default=4, - help="num columns (features)") - parser.add_argument( - "-C", - "--num-classes", - required=False, - type=int, - default=2, - help="num classes") - parser.add_argument( - "-s", - "--seed", - required=False, - type=int, - default=1234, - help="random seed") - - args = parser.parse_args() - - if os.path.exists(args.filename): - print(f"File already exists: {args.filename}. 
Skipping creation.") - - np.random.seed(args.seed) - create_parquet( - args.filename, - num_rows=int(args.num_rows), - num_partitions=int(args.num_partitions), - num_features=int(args.num_cols), - num_classes=int(args.num_classes)) diff --git a/release/tune_tests/scalability_tests/run.sh b/release/tune_tests/scalability_tests/run.sh index 6c7172bfcc00..e4f5698aa6a9 100755 --- a/release/tune_tests/scalability_tests/run.sh +++ b/release/tune_tests/scalability_tests/run.sh @@ -1,7 +1,6 @@ #!/usr/bin/env bash -nodes="" -ray_version="" +ray_version="" commit="" ray_branch="" @@ -9,11 +8,9 @@ for i in "$@" do echo "$i" case "$i" in - --nodes=*) - nodes="${i#*=}" - ;; --ray-version=*) ray_version="${i#*=}" + ;; --commit=*) commit="${i#*=}" @@ -35,22 +32,25 @@ case "$i" in esac done -if [[ $nodes == "" || $ray_version == "" || $commit == "" || $ray_branch == "" ]] +if [[ $ray_version == "" || $commit == "" || $ray_branch == "" ]] then - echo "Provide --nodes --ray-version, --commit, and --ray-branch" + echo "Provide --ray-version, --commit, and --ray-branch" exit 1 fi -echo "nodes: $nodes" echo "version: $ray_version" echo "commit: $commit" echo "branch: $ray_branch" echo "workload: ignored" -# wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" -# pip install -U "$wheel" +wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" + +pip install -U pip +pip install -U "$wheel" +pip install "ray[tune]" "ray" +pip install boto3==1.4.8 cython==0.29.0 -if ! python "wait_cluster.py" "$nodes" 600; then +if ! python "wait_cluster.py" 16 450; then echo "Cluster did not come up in time. Aborting test." 
exit 1 fi diff --git a/release/tune_tests/scalability_tests/workloads/_trainable.py b/release/tune_tests/scalability_tests/workloads/_trainable.py deleted file mode 100644 index c5ce8c005f79..000000000000 --- a/release/tune_tests/scalability_tests/workloads/_trainable.py +++ /dev/null @@ -1,153 +0,0 @@ -import os -import time - -import numpy as np -import pickle - -from ray import tune - -from ray.tune.durable_trainable import DurableTrainable - - -class TestDurableTrainable(DurableTrainable): - def __init__(self, remote_checkpoint_dir, config, logger_creator=None): - self.setup_env() - - super(TestDurableTrainable, self).__init__( - remote_checkpoint_dir, - config=config, - logger_creator=logger_creator) - - def setup_env(self): - pass - - def setup(self, config): - self._num_iters = int(config["num_iters"]) - self._sleep_time = config["sleep_time"] - self._score = config["score"] - - self._checkpoint_iters = config["checkpoint_iters"] - self._checkpoint_size_b = config["checkpoint_size_b"] - self._checkpoint_num_items = self._checkpoint_size_b // 8 # np.float64 - - self._iter = 0 - - def step(self): - if self._iter > 0: - time.sleep(self._sleep_time) - - res = dict(score=self._iter + self._score) - - if self._iter >= self._num_iters: - res["done"] = True - - self._iter += 1 - return res - - def save_checkpoint(self, tmp_checkpoint_dir): - checkpoint_file = os.path.join(tmp_checkpoint_dir, "bogus.ckpt") - checkpoint_data = np.random.uniform( - 0, 1, size=self._checkpoint_num_items) - with open(checkpoint_file, "wb") as fp: - pickle.dump(checkpoint_data, fp) - return checkpoint_file - - def load_checkpoint(self, checkpoint): - pass - - -def function_trainable(config): - num_iters = int(config["num_iters"]) - sleep_time = config["sleep_time"] - score = config["score"] - - checkpoint_iters = config["checkpoint_iters"] - checkpoint_size_b = config["checkpoint_size_b"] - checkpoint_num_items = checkpoint_size_b // 8 # np.float64 - - for i in range(num_iters): - if 
checkpoint_iters >= 0 and checkpoint_size_b > 0 and \ - i % checkpoint_iters == 0: - with tune.checkpoint_dir(step=i) as dir: - checkpoint_file = os.path.join(dir, "bogus.ckpt") - checkpoint_data = np.random.uniform( - 0, 1, size=checkpoint_num_items) - with open(checkpoint_file, "wb") as fp: - pickle.dump(checkpoint_data, fp) - - tune.report(score=i + score) - time.sleep(sleep_time) - - -def timed_tune_run(name: str, - num_samples: int, - results_per_second: int = 1, - trial_length_s: int = 1, - max_runtime: int = 300, - checkpoint_freq_s: int = -1, - checkpoint_size_b: int = 0, - **tune_kwargs): - durable = "sync_config" in tune_kwargs and \ - tune_kwargs["sync_config"].upload_dir.startswith("s3://") - - sleep_time = 1. / results_per_second - num_iters = int(trial_length_s / sleep_time) - checkpoint_iters = -1 - if checkpoint_freq_s >= 0: - checkpoint_iters = int(checkpoint_freq_s / sleep_time) - - config = { - "score": tune.uniform(0., 1.), - "num_iters": num_iters, - "sleep_time": sleep_time, - "checkpoint_iters": checkpoint_iters, - "checkpoint_size_b": checkpoint_size_b, - } - - print(f"Starting benchmark with config: {config}") - - run_kwargs = {"reuse_actors": True, "verbose": 2} - run_kwargs.update(tune_kwargs) - - _train = function_trainable - - aws_key_id = os.getenv("AWS_ACCESS_KEY_ID", "") - aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY", "") - aws_session = os.getenv("AWS_SESSION_TOKEN", "") - - if durable: - - class AwsDurableTrainable(TestDurableTrainable): - AWS_ACCESS_KEY_ID = aws_key_id - AWS_SECRET_ACCESS_KEY = aws_secret - AWS_SESSION_TOKEN = aws_session - - def setup_env(self): - os.environ["AWS_ACCESS_KEY_ID"] = self.AWS_ACCESS_KEY_ID - os.environ[ - "AWS_SECRET_ACCESS_KEY"] = self.AWS_SECRET_ACCESS_KEY - os.environ["AWS_SESSION_TOKEN"] = self.AWS_SESSION_TOKEN - - _train = AwsDurableTrainable - run_kwargs["checkpoint_freq"] = checkpoint_iters - - start_time = time.monotonic() - tune.run( - _train, - config=config, - num_samples=num_samples, 
- raise_on_failed_trial=False, - **run_kwargs) - time_taken = time.monotonic() - start_time - - assert time_taken < max_runtime, \ - f"The {name} test took {time_taken:.2f} seconds, but should not " \ - f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n" \ - f"--- FAILED: {name.upper()} ::: " \ - f"{time_taken:.2f} > {max_runtime:.2f} ---" - - print(f"The {name} test took {time_taken:.2f} seconds, which " - f"is below the budget of {max_runtime:.2f} seconds. " - f"Test successful. \n\n" - f"--- PASSED: {name.upper()} ::: " - f"{time_taken:.2f} <= {max_runtime:.2f} ---") diff --git a/release/tune_tests/scalability_tests/workloads/test_bookkeeping_overhead.py b/release/tune_tests/scalability_tests/workloads/test_bookkeeping_overhead.py deleted file mode 100644 index 2792c18d8830..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_bookkeeping_overhead.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Bookkeeping overhead (1 node, 10k trials) - -In this run, we will start a large number of trials (10k) that take just a -second to run. We thus measure overhead that comes with dealing with a -large number of trials, e.g. experiment checkpointing. - -Cluster: cluster_1x16.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 800 seconds. 
- -Theoretical minimum time: 10000/16 = 625 seconds -""" -import os - -import ray - -from _trainable import timed_tune_run - - -def main(): - os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "100" # Tweak - - ray.init(address="auto") - - num_samples = 10000 - results_per_second = 1 - trial_length_s = 1 - - max_runtime = 800 - - timed_tune_run( - name="bookkeeping overhead", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py b/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py deleted file mode 100644 index b37fd596f6fe..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Durable trainable (16 trials, checkpoint to cloud) - -In this run, we will start 16 trials on a cluster. The trials create -10 MB checkpoints every 10 seconds and should only keep 2 of these. This test -ensures that durable checkpoints don't slow down experiment progress too much. - -Cluster: cluster_16x2.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 500 seconds. 
- -Theoretical minimum time: 300 seconds -""" -import ray -from ray import tune - -from _trainable import timed_tune_run - - -def main(): - ray.init(address="auto") - - num_samples = 16 - results_per_second = 10 / 60 - trial_length_s = 300 - - max_runtime = 500 - - timed_tune_run( - name="durable trainable", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - checkpoint_freq_s=10, # Once every 10 seconds - checkpoint_size_b=int(10 * 1000**2), # 10 MB - keep_checkpoints_num=2, - resources_per_trial={"cpu": 2}, - sync_config=tune.SyncConfig( - sync_to_driver=False, - upload_dir="s3://ray-tune-scalability-test/durable/", - )) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py b/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py deleted file mode 100644 index 05484431c700..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Large checkpoints in long running trials (16 trials, 4 GB checkpoints). - -In this run, we will start 16 trials on a single node. The trials create -4 GB checkpoints every 15 minutes and should only keep 2 of these. This test -ensures that handling large checkpoints don't lead to much overhead. - -Cluster: cluster_1x32_hd.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 90,000 seconds. 
- -Theoretical minimum time: 86,400 seconds -""" -import ray -from ray import tune - -from _trainable import timed_tune_run - - -def main(): - ray.init(address="auto") - - num_samples = 16 - results_per_second = 1 / 60 - trial_length_s = 86400 - - max_runtime = 90000 - - timed_tune_run( - name="long running large checkpoints", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - checkpoint_freq_s=900, # Once every 15 minutes - checkpoint_size_b=int(3.75 * 1000**3), - keep_checkpoints_num=2, # 2 * 16 * 4 = 128 GB - resources_per_trial={"cpu": 1}, - sync_config=tune.SyncConfig(sync_to_driver=True)) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py deleted file mode 100644 index 3222b6eca97d..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Networking overhead (200 trials on 200 nodes) - -In this run, we will start 200 trials and run them on 200 different nodes. -This test will thus measure the overhead that comes with network communication -and specifically log synchronization. - -Cluster: cluster_200x2.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 500 seconds. 
- -Theoretical minimum time: 300 seconds -""" -import ray -from ray import tune - -from _trainable import timed_tune_run - - -def main(): - ray.init(address="auto") - - num_samples = 200 - results_per_second = 1 - trial_length_s = 300 - - max_runtime = 500 - - timed_tune_run( - name="result network overhead", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - resources_per_trial={"cpu": 2}, # One per node - sync_config=tune.SyncConfig(sync_to_driver=True)) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_result_buffering.py b/release/tune_tests/scalability_tests/workloads/test_result_buffering.py new file mode 100644 index 000000000000..e6ea1762f9b2 --- /dev/null +++ b/release/tune_tests/scalability_tests/workloads/test_result_buffering.py @@ -0,0 +1,54 @@ +import time + +import ray +from ray import tune +from ray.tune.cluster_info import is_ray_cluster + + +def my_naive_trainable(config): + for i in range(int(config["num_iters"])): + tune.report(score=i + config["score"]) + time.sleep(config["sleep_time"]) + + +def main(): + ray.init(address="auto") + + num_samples = 1000 + + sleep_time = 0.1 + num_iters = 300 + + expected_run_time = num_iters * sleep_time + + # Allow minimum of 20 % overhead (or 10 seconds for short runs) + expected_run_time += max(expected_run_time * 0.2, 10.) + + if is_ray_cluster(): + # Add constant overhead for SSH connection + expected_run_time += 0.3 * num_samples + + start_time = time.time() + tune.run( + my_naive_trainable, + config={ + "score": tune.uniform(0., 1.), + "num_iters": num_iters, + "sleep_time": sleep_time + }, + reuse_actors=True, + verbose=2, + num_samples=num_samples) + time_taken = time.time() - start_time + + assert time_taken < expected_run_time, \ + f"The buffering test took {time_taken:.2f} seconds, but should not " \ + f"have exceeded {expected_run_time:.2f} seconds. Test failed." 
+ + print(f"The buffering test took {time_taken:.2f} seconds, which " + f"is below the budget of {expected_run_time:.2f} seconds. " + f"Test successful.") + + +if __name__ == "__main__": + main() diff --git a/release/tune_tests/scalability_tests/workloads/test_result_throughput_cluster.py b/release/tune_tests/scalability_tests/workloads/test_result_throughput_cluster.py deleted file mode 100644 index 8a3ba682ca89..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_result_throughput_cluster.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Result throughput on a cluster - -In this run, we will start 1000 trials concurrently that report often -(10 results per second). We thus measure the amount of overhead incurred when -dealing with a large number of results from distributed trials. - -Cluster: cluster_16x64.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 120 seconds. - -Theoretical minimum time: 100 seconds -""" -import os - -import ray -from ray import tune -from ray.tune.cluster_info import is_ray_cluster - -from _trainable import timed_tune_run - - -def main(): - os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" # Tweak - - ray.init(address="auto") - - num_samples = 1000 - results_per_second = 10 - trial_length_s = 100 - - max_runtime = 120 - - if is_ray_cluster(): - # Add constant overhead for SSH connection - max_runtime = 120 - - timed_tune_run( - name="result throughput cluster", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime, - sync_config=tune.SyncConfig(sync_to_driver=False)) # Tweak! 
- - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_result_throughput_single_node.py b/release/tune_tests/scalability_tests/workloads/test_result_throughput_single_node.py deleted file mode 100644 index 288b28d5f9a5..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_result_throughput_single_node.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Result throughput on a single node - -In this run, we will start 96 trials concurrently that report very often -(500 results per second). We thus measure the amount of overhead incurred when -dealing with a large number of results. - -Cluster: cluster_1x96.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 120 seconds. - -Theoretical minimum time: 100 seconds -""" -import os - -import ray - -from _trainable import timed_tune_run - - -def main(): - os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" # Tweak - - ray.init(address="auto") - - num_samples = 96 - results_per_second = 500 - trial_length_s = 100 - - max_runtime = 120 - - timed_tune_run( - name="result throughput single node", - num_samples=num_samples, - results_per_second=results_per_second, - trial_length_s=trial_length_s, - max_runtime=max_runtime) - - -if __name__ == "__main__": - main() diff --git a/release/tune_tests/scalability_tests/workloads/test_xgboost_sweep.py b/release/tune_tests/scalability_tests/workloads/test_xgboost_sweep.py deleted file mode 100644 index 16a1f261693a..000000000000 --- a/release/tune_tests/scalability_tests/workloads/test_xgboost_sweep.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Large-scale XGBoost parameter sweep - -In this run, we will start 32 trials of 32 actors each running distributed -XGBoost training. This test is more about making sure that the run succeeds -than about total runtime. However, it is expected that this is faster than -1 hour. - -We fix the max_depth to 4 and the number of boosting rounds to 100. 
The -fastest observed training time for 32 actors (1 CPU each) was about 2000 -seconds. We allow up to 10 minutes of slack, so aim for 2600 seconds total -tuning time. - -Cluster: cluster_16x64_data.yaml - -Test owner: krfricke - -Acceptance criteria: Should run faster than 2600 seconds. Should run without -errors. -""" -import os -import time - -import ray -from ray import tune - -from xgboost_ray import train, RayParams, RayDMatrix - - -def xgboost_train(config, num_actors=128, num_boost_round=200): - train_set = RayDMatrix( - os.path.expanduser("~/data/train.parquet"), "labels") - test_set = RayDMatrix(os.path.expanduser("~/data/test.parquet"), "labels") - - evals_result = {} - - bst = train( - params=config, - dtrain=train_set, - evals=[(test_set, "eval")], - evals_result=evals_result, - ray_params=RayParams( - max_actor_restarts=1, - gpus_per_actor=0, - cpus_per_actor=1, - num_actors=num_actors), - verbose_eval=False, - num_boost_round=num_boost_round) - - model_path = "tuned.xgb" - bst.save_model(model_path) - print("Final validation error: {:.4f}".format( - evals_result["eval"]["error"][-1])) - - -def main(): - name = "large xgboost sweep" - - ray.init(address="auto") - - num_samples = 32 - num_actors_per_sample = 32 - - max_runtime = 2600 - - config = { - "tree_method": "approx", - "objective": "binary:logistic", - "eval_metric": ["logloss", "error"], - "eta": tune.loguniform(1e-4, 1e-1), - "subsample": tune.uniform(0.5, 1.0), - "max_depth": 4 - } - - start_time = time.monotonic() - tune.run( - tune.with_parameters( - xgboost_train, - num_actors=num_actors_per_sample, - num_boost_round=100), - config=config, - num_samples=num_samples) - time_taken = time.monotonic() - start_time - - assert time_taken < max_runtime, \ - f"The {name} test took {time_taken:.2f} seconds, but should not " \ - f"have exceeded {max_runtime:.2f} seconds. Test failed. 
\n\n" \ - f"--- FAILED: {name.upper()} ::: " \ - f"{time_taken:.2f} > {max_runtime:.2f} ---" - - print(f"The {name} test took {time_taken:.2f} seconds, which " - f"is below the budget of {max_runtime:.2f} seconds. " - f"Test successful. \n\n" - f"--- PASSED: {name.upper()} ::: " - f"{time_taken:.2f} <= {max_runtime:.2f} ---") - - -if __name__ == "__main__": - main() diff --git a/release/xgboost_tests/README.rst b/release/xgboost_tests/README.rst deleted file mode 100644 index 303b09ef92e9..000000000000 --- a/release/xgboost_tests/README.rst +++ /dev/null @@ -1,32 +0,0 @@ -XGBoost on Ray tests -==================== - -This directory contains various XGBoost on Ray release tests. - -You should run these tests with the `releaser `_ tool. - -Overview --------- -There are four kinds of tests: - -1. ``distributed_api_test`` - checks general API functionality and should finish very quickly (< 1 minute) -2. ``train_*`` - checks single trial training on different setups. -3. ``tune_*`` - checks multi trial training via Ray Tune. -4. ``ft_*`` - checks fault tolerance. **These tests are currently flaky** - -Generally the releaser tool will run all tests in parallel, but if you do -it sequentially, be sure to do it in the order above. If ``train_*`` fails, -``tune_*`` will fail, too. - -Flaky fault tolerance tests ---------------------------- -The fault tolerance tests are currently flaky. In some runs, more nodes die -than expected, causing the test to fail. In other cases, the re-scheduled -actors become available too soon after crashing, causing the assertions to -fail. Please consider re-running the test a couple of times or contact the -test owner with outputs from the tests for further questions. - -Acceptance criteria -------------------- -These tests are considered passing when they throw no error at the end of -the output log. 
diff --git a/release/xgboost_tests/cluster_cpu_moderate.yaml b/release/xgboost_tests/cluster_cpu_moderate.yaml index a65c49336a1c..18a18dceb56e 100644 --- a/release/xgboost_tests/cluster_cpu_moderate.yaml +++ b/release/xgboost_tests/cluster_cpu_moderate.yaml @@ -2,7 +2,9 @@ cluster_name: ray-xgboost-release-cpu-moderate min_workers: 31 max_workers: 31 +initial_workers: 31 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_cpu_small.yaml b/release/xgboost_tests/cluster_cpu_small.yaml index 4b97439b9d59..fe9e997f85aa 100644 --- a/release/xgboost_tests/cluster_cpu_small.yaml +++ b/release/xgboost_tests/cluster_cpu_small.yaml @@ -2,7 +2,9 @@ cluster_name: ray-xgboost-release-cpu-small min_workers: 3 max_workers: 3 +initial_workers: 3 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_gpu_small.yaml b/release/xgboost_tests/cluster_gpu_small.yaml index 535d28490f71..5bea4f19acf2 100644 --- a/release/xgboost_tests/cluster_gpu_small.yaml +++ b/release/xgboost_tests/cluster_gpu_small.yaml @@ -2,7 +2,9 @@ cluster_name: ray-xgboost-release-gpu-small min_workers: 4 max_workers: 4 +initial_workers: 4 +target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/rllib/BUILD b/rllib/BUILD index a09a549b1712..daa623dff843 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -436,13 +436,13 @@ py_test( srcs = ["agents/a3c/tests/test_a3c.py"] ) -# APEXTrainer (DQN) -py_test( - name = "test_apex_dqn", - tags = ["agents_dir"], - size = "medium", - srcs = ["agents/dqn/tests/test_apex_dqn.py"] -) +## APEXTrainer (DQN) +#py_test( +# name = "test_apex_dqn", +# tags = ["agents_dir"], +# size = "large", +# srcs = ["agents/dqn/tests/test_apex_dqn.py"] +#) # APEXDDPGTrainer py_test( @@ -482,15 +482,6 @@ py_test( srcs = ["agents/dqn/tests/test_simple_q.py"] ) -# TODO: enable once we have a MuJoCo-independent test case. 
-## Dreamer -#py_test( -# name = "test_dreamer", -# tags = ["agents_dir"], -# size = "small", -# srcs = ["agents/dreamer/tests/test_dreamer.py"] -#) - # ES py_test( name = "test_es", @@ -517,7 +508,7 @@ py_test( py_test( name = "test_marwil", tags = ["agents_dir"], - size = "large", + size = "medium", # Include the json data file. data = ["tests/data/cartpole/large.json"], srcs = ["agents/marwil/tests/test_marwil.py"] @@ -527,7 +518,7 @@ py_test( py_test( name = "test_bc", tags = ["agents_dir"], - size = "large", + size = "medium", # Include the json data file. data = ["tests/data/cartpole/large.json"], srcs = ["agents/marwil/tests/test_bc.py"] @@ -542,12 +533,12 @@ py_test( ) # MBMPOTrainer -py_test( - name = "test_mbmpo", - tags = ["agents_dir"], - size = "medium", - srcs = ["agents/mbmpo/tests/test_mbmpo.py"] -) +#py_test( +# name = "test_mbmpo", +# tags = ["agents_dir"], +# size = "medium", +# srcs = ["agents/mbmpo/tests/test_mbmpo.py"] +#) # PGTrainer py_test( @@ -1069,13 +1060,6 @@ sh_test( data = glob(["examples/serving/*.py"]), ) -py_test( - name = "env/wrappers/tests/test_unity3d_env", - tags = ["env"], - size = "small", - srcs = ["env/wrappers/tests/test_unity3d_env.py"] -) - py_test( name = "env/wrappers/tests/test_recsim_wrapper", tags = ["env"], @@ -1466,29 +1450,29 @@ py_test( args = ["TestSupportedMultiAgentPG"] ) +#py_test( +# name = "tests/test_supported_multi_agent_off_policy", +# main = "tests/test_supported_multi_agent.py", +# tags = ["tests_dir", "tests_dir_S"], +# size = "medium", +# srcs = ["tests/test_supported_multi_agent.py"], +# args = ["TestSupportedMultiAgentOffPolicy"] +#) + py_test( - name = "tests/test_supported_multi_agent_off_policy", - main = "tests/test_supported_multi_agent.py", + name = "tests/test_supported_spaces_pg", + main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], - size = "medium", - srcs = ["tests/test_supported_multi_agent.py"], - args = ["TestSupportedMultiAgentOffPolicy"] + size = 
"enormous", + srcs = ["tests/test_supported_spaces.py"], + args = ["TestSupportedSpacesPG"] ) -# py_test( -# name = "tests/test_supported_spaces_pg", -# main = "tests/test_supported_spaces.py", -# tags = ["tests_dir", "tests_dir_S"], -# size = "enormous", -# srcs = ["tests/test_supported_spaces.py"], -# args = ["TestSupportedSpacesPG"] -# ) - py_test( name = "tests/test_supported_spaces_off_policy", main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], - size = "medium", + size = "enormous", srcs = ["tests/test_supported_spaces.py"], args = ["TestSupportedSpacesOffPolicy"] ) @@ -1497,7 +1481,7 @@ py_test( name = "tests/test_supported_spaces_evolution_algos", main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], - size = "medium", + size = "large", srcs = ["tests/test_supported_spaces.py"], args = ["TestSupportedSpacesEvolutionAlgos"] ) @@ -1509,13 +1493,6 @@ py_test( srcs = ["tests/test_timesteps.py"] ) -py_test( - name = "tests/test_trainer", - tags = ["tests_dir", "tests_dir_T"], - size = "small", - srcs = ["tests/test_trainer.py"] -) - # -------------------------------------------------------------------- # examples/ directory # @@ -1753,7 +1730,7 @@ py_test( name = "examples/custom_eval_tf", main = "examples/custom_eval.py", tags = ["examples", "examples_C"], - size = "medium", + size = "small", srcs = ["examples/custom_eval.py"], args = ["--num-cpus=4", "--as-test"] ) @@ -1762,7 +1739,7 @@ py_test( name = "examples/custom_eval_torch", main = "examples/custom_eval.py", tags = ["examples", "examples_C"], - size = "medium", + size = "small", srcs = ["examples/custom_eval.py"], args = ["--num-cpus=4", "--as-test", "--torch"] ) @@ -2114,7 +2091,7 @@ py_test( tags = ["examples", "examples_T"], size = "medium", srcs = ["examples/trajectory_view_api.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=100.0"] + args = ["--as-test", "--framework=tf", "--stop-reward=80.0"] ) py_test( @@ -2123,7 +2100,7 @@ py_test( 
tags = ["examples", "examples_T"], size = "medium", srcs = ["examples/trajectory_view_api.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=100.0"] + args = ["--as-test", "--framework=torch", "--stop-reward=80.0"] ) py_test( diff --git a/rllib/__init__.py b/rllib/__init__.py index 4af44a28786f..d27194f692b3 100644 --- a/rllib/__init__.py +++ b/rllib/__init__.py @@ -27,12 +27,12 @@ def _setup_logger(): def _register_all(): from ray.rllib.agents.trainer import Trainer, with_common_config - from ray.rllib.agents.registry import ALGORITHMS, get_trainer_class + from ray.rllib.agents.registry import ALGORITHMS, get_agent_class from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS for key in list(ALGORITHMS.keys()) + list(CONTRIBUTED_ALGORITHMS.keys( )) + ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]: - register_trainable(key, get_trainer_class(key)) + register_trainable(key, get_agent_class(key)) def _see_contrib(name): """Returns dummy agent class warning algo is in contrib/.""" diff --git a/rllib/agents/callbacks.py b/rllib/agents/callbacks.py index 1972fabec711..e84cf41485b7 100644 --- a/rllib/agents/callbacks.py +++ b/rllib/agents/callbacks.py @@ -7,6 +7,7 @@ from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.typing import AgentID, PolicyID +from ray.util.debug import log_once if TYPE_CHECKING: from ray.rllib.evaluation import RolloutWorker @@ -55,6 +56,10 @@ def on_episode_start(self, kwargs: Forward compatibility placeholder. """ + if env_index is not None: + if log_once("callbacks_env_index_deprecated"): + deprecation_warning("env_index", "episode.env_id", error=False) + if self.legacy_callbacks.get("on_episode_start"): self.legacy_callbacks["on_episode_start"]({ "env": base_env, @@ -84,6 +89,10 @@ def on_episode_step(self, kwargs: Forward compatibility placeholder. 
""" + if env_index is not None: + if log_once("callbacks_env_index_deprecated"): + deprecation_warning("env_index", "episode.env_id", error=False) + if self.legacy_callbacks.get("on_episode_step"): self.legacy_callbacks["on_episode_step"]({ "env": base_env, @@ -115,6 +124,10 @@ def on_episode_end(self, kwargs: Forward compatibility placeholder. """ + if env_index is not None: + if log_once("callbacks_env_index_deprecated"): + deprecation_warning("env_index", "episode.env_id", error=False) + if self.legacy_callbacks.get("on_episode_end"): self.legacy_callbacks["on_episode_end"]({ "env": base_env, @@ -175,7 +188,7 @@ def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, }) def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, - result: dict, **kwargs) -> None: + **kwargs) -> None: """Called at the beginning of Policy.learn_on_batch(). Note: This is called before 0-padding via @@ -185,7 +198,6 @@ def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, policy (Policy): Reference to the current Policy object. train_batch (SampleBatch): SampleBatch to be trained on. You can mutate this object to modify the samples generated. - result (dict): A results dict to add custom metrics to. kwargs: Forward compatibility placeholder. """ diff --git a/rllib/agents/cql/cql.py b/rllib/agents/cql/cql.py index 30bbe89d4553..04a63be72751 100644 --- a/rllib/agents/cql/cql.py +++ b/rllib/agents/cql/cql.py @@ -15,8 +15,6 @@ SAC_CONFIG, { # You should override this to point to an offline dataset. 
"input": "sampler", - # Offline RL does not need IS estimators - "input_evaluation": [], # Number of iterations with Behavior Cloning Pretraining "bc_iters": 20000, # CQL Loss Temperature diff --git a/rllib/agents/ddpg/ddpg_tf_policy.py b/rllib/agents/ddpg/ddpg_tf_policy.py index 203add618ce6..414910cc33f8 100644 --- a/rllib/agents/ddpg/ddpg_tf_policy.py +++ b/rllib/agents/ddpg/ddpg_tf_policy.py @@ -13,15 +13,13 @@ PRIO_WEIGHTS from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.models import ModelCatalog -from ray.rllib.models.tf.tf_action_dist import Deterministic, Dirichlet -from ray.rllib.models.torch.torch_action_dist import TorchDeterministic, \ - TorchDirichlet +from ray.rllib.models.tf.tf_action_dist import Deterministic +from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.utils.annotations import override from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import get_variable, try_import_tf -from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.tf_ops import huber_loss, make_tf_callable tf1, tf, tfv = try_import_tf() @@ -93,13 +91,9 @@ def get_distribution_inputs_and_class(policy, }, [], None) dist_inputs = model.get_policy_output(model_out) - if isinstance(policy.action_space, Simplex): - distr_class = TorchDirichlet if policy.config["framework"] == "torch" \ - else Dirichlet - else: - distr_class = TorchDeterministic if \ - policy.config["framework"] == "torch" else Deterministic - return dist_inputs, distr_class, [] # []=state out + return dist_inputs, (TorchDeterministic + if policy.config["framework"] == "torch" else + Deterministic), [] # []=state out def ddpg_actor_critic_loss(policy, model, _, train_batch): diff --git a/rllib/agents/ddpg/ddpg_torch_policy.py b/rllib/agents/ddpg/ddpg_torch_policy.py index 5041ae5fed46..f6c73f912da7 
100644 --- a/rllib/agents/ddpg/ddpg_torch_policy.py +++ b/rllib/agents/ddpg/ddpg_torch_policy.py @@ -5,12 +5,10 @@ get_distribution_inputs_and_class, validate_spaces from ray.rllib.agents.dqn.dqn_tf_policy import postprocess_nstep_and_prio, \ PRIO_WEIGHTS -from ray.rllib.models.torch.torch_action_dist import TorchDeterministic, \ - TorchDirichlet +from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss torch, nn = try_import_torch() @@ -26,11 +24,7 @@ def build_ddpg_models_and_action_dist(policy, obs_space, action_space, config): device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) policy.target_model = policy.target_model.to(device) - - if isinstance(action_space, Simplex): - return model, TorchDirichlet - else: - return model, TorchDeterministic + return model, TorchDeterministic def ddpg_actor_critic_loss(policy, model, _, train_batch): diff --git a/rllib/agents/ddpg/tests/test_ddpg.py b/rllib/agents/ddpg/tests/test_ddpg.py index 0d5ddb8c5b0e..339f36fb537c 100644 --- a/rllib/agents/ddpg/tests/test_ddpg.py +++ b/rllib/agents/ddpg/tests/test_ddpg.py @@ -184,8 +184,15 @@ def test_ddpg_loss_function(self): env = SimpleEnv batch_size = 100 - obs_size = (batch_size, 1) - actions = np.random.random(size=(batch_size, 1)) + if env is SimpleEnv: + obs_size = (batch_size, 1) + actions = np.random.random(size=(batch_size, 1)) + elif env == "CartPole-v0": + obs_size = (batch_size, 4) + actions = np.random.randint(0, 2, size=(batch_size, )) + else: + obs_size = (batch_size, 3) + actions = np.random.random(size=(batch_size, 1)) # Batch of size=n. 
input_ = self._get_batch_helper(obs_size, actions, batch_size) diff --git a/rllib/agents/dreamer/dreamer.py b/rllib/agents/dreamer/dreamer.py index 21646d61871d..94774d9fec91 100644 --- a/rllib/agents/dreamer/dreamer.py +++ b/rllib/agents/dreamer/dreamer.py @@ -31,8 +31,6 @@ "discount": 0.99, # Lambda "lambda": 0.95, - # Clipping is done inherently via policy tanh. - "clip_actions": False, # Training iterations per data collection from real env "dreamer_train_iters": 100, # Horizon for Enviornment (1000 for Mujoco/DMC) diff --git a/rllib/agents/dreamer/dreamer_model.py b/rllib/agents/dreamer/dreamer_model.py index f2db417e512b..5483f664f839 100644 --- a/rllib/agents/dreamer/dreamer_model.py +++ b/rllib/agents/dreamer/dreamer_model.py @@ -1,6 +1,6 @@ import numpy as np from typing import Any, List, Tuple -from ray.rllib.models.torch.misc import Reshape +from ray.rllib.models.torch.modules.reshape import Reshape from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.framework import TensorType diff --git a/rllib/agents/dreamer/tests/test_dreamer.py b/rllib/agents/dreamer/tests/test_dreamer.py deleted file mode 100644 index 2b318866ca48..000000000000 --- a/rllib/agents/dreamer/tests/test_dreamer.py +++ /dev/null @@ -1,41 +0,0 @@ -import unittest - -import ray -from ray import tune -import ray.rllib.agents.dreamer as dreamer -from ray.rllib.examples.env.dm_control_suite import hopper_hop -from ray.rllib.utils.test_utils import check_compute_single_action, \ - framework_iterator - - -class TestDreamer(unittest.TestCase): - """Sanity tests for DreamerTrainer.""" - - def setUp(self): - ray.init() - - def tearDown(self): - ray.shutdown() - - def test_dreamer_compilation(self): - """Test whether an DreamerTrainer can be built with all frameworks.""" - config = dreamer.DEFAULT_CONFIG.copy() - tune.register_env("dm_control_hopper_hop", lambda _: hopper_hop()) - - num_iterations = 1 - - # Test 
against all frameworks. - for _ in framework_iterator(config, frameworks="torch"): - for env in ["dm_control_hopper_hop"]: - trainer = dreamer.DREAMERTrainer(config=config, env=env) - for i in range(num_iterations): - results = trainer.train() - print(results) - check_compute_single_action(trainer) - trainer.stop() - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/agents/maml/maml_torch_policy.py b/rllib/agents/maml/maml_torch_policy.py index 695826798272..2e0e1e2083b7 100644 --- a/rllib/agents/maml/maml_torch_policy.py +++ b/rllib/agents/maml/maml_torch_policy.py @@ -8,8 +8,8 @@ from ray.rllib.agents.ppo.ppo_tf_policy import setup_config from ray.rllib.agents.ppo.ppo_torch_policy import vf_preds_fetches, \ ValueNetworkMixin -from ray.rllib.utils.torch_ops import apply_grad_clipping from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_ops import apply_grad_clipping torch, nn = try_import_torch() @@ -178,7 +178,7 @@ def __init__(self, # Meta Update ppo_loss, s_loss, kl_loss, v_loss, ent = self.compute_losses( - fnet, self.inner_adaptation_steps - 1, i, clip_loss=True) + fnet, self.inner_adaptation_steps, i, clip_loss=True) inner_loss = torch.mean( torch.stack([ @@ -271,14 +271,8 @@ def maml_loss(policy, model, dist_class, train_batch): # `split` may not exist yet (during test-loss call), use a dummy value. # Cannot use get here due to train_batch being a TrackingDict. 
- if "split" in train_batch: - split = train_batch["split"] - else: - split_shape = (policy.config["inner_adaptation_steps"], - policy.config["num_workers"]) - split_const = int(train_batch["obs"].shape[0] // - (split_shape[0] * split_shape[1])) - split = torch.ones(split_shape, dtype=int) * split_const + split = train_batch["split"] if "split" in train_batch else \ + torch.tensor([[8, 8], [8, 8]]) policy.loss_obj = MAMLLoss( model=model, dist_class=dist_class, diff --git a/rllib/agents/maml/tests/test_maml.py b/rllib/agents/maml/tests/test_maml.py index b84e02857190..e5ef3cf694b0 100644 --- a/rllib/agents/maml/tests/test_maml.py +++ b/rllib/agents/maml/tests/test_maml.py @@ -23,21 +23,15 @@ def test_maml_compilation(self): num_iterations = 1 # Test for tf framework (torch not implemented yet). - for fw in framework_iterator(config, frameworks=("tf", "torch")): - for env in [ - "pendulum_mass.PendulumMassEnv", - "cartpole_mass.CartPoleMassEnv" - ]: - if fw == "tf" and env.startswith("cartpole"): - continue - print("env={}".format(env)) - env_ = "ray.rllib.examples.env.{}".format(env) - trainer = maml.MAMLTrainer(config=config, env=env_) - for i in range(num_iterations): - trainer.train() - check_compute_single_action( - trainer, include_prev_action_reward=True) - trainer.stop() + for _ in framework_iterator(config, frameworks=("tf")): + trainer = maml.MAMLTrainer( + config=config, + env="ray.rllib.examples.env.pendulum_mass.PendulumMassEnv") + for i in range(num_iterations): + trainer.train() + check_compute_single_action( + trainer, include_prev_action_reward=True) + trainer.stop() if __name__ == "__main__": diff --git a/rllib/agents/marwil/marwil.py b/rllib/agents/marwil/marwil.py index d123b3ef5f5f..c4f88fdb8b30 100644 --- a/rllib/agents/marwil/marwil.py +++ b/rllib/agents/marwil/marwil.py @@ -21,8 +21,6 @@ "beta": 1.0, # Balancing value estimation loss and policy optimization loss. 
"vf_coeff": 1.0, - # If specified, clip the global norm of gradients by this amount. - "grad_clip": None, # Whether to calculate cumulative rewards. "postprocess_inputs": True, # Whether to rollout "complete_episodes" or "truncate_episodes". diff --git a/rllib/agents/marwil/marwil_tf_policy.py b/rllib/agents/marwil/marwil_tf_policy.py index 211f9467e7b0..44352be4f883 100644 --- a/rllib/agents/marwil/marwil_tf_policy.py +++ b/rllib/agents/marwil/marwil_tf_policy.py @@ -1,7 +1,6 @@ import logging import ray -from ray.rllib.agents.ppo.ppo_tf_policy import compute_and_clip_gradients from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing @@ -134,7 +133,7 @@ def __init__(self, policy, value_estimates, action_dist, actions, # Exponentially weighted advantages. c = tf.math.sqrt(policy._moving_average_sqd_adv_norm) - exp_advs = tf.math.exp(beta * (adv / (1e-8 + c))) + exp_advs = tf.math.exp(beta * (adv / c)) # Static graph. 
else: update_adv_norm = tf1.assign_add( @@ -201,5 +200,4 @@ def setup_mixins(policy, obs_space, action_space, config): stats_fn=stats, postprocess_fn=postprocess_advantages, before_loss_init=setup_mixins, - gradients_fn=compute_and_clip_gradients, mixins=[ValueNetworkMixin]) diff --git a/rllib/agents/marwil/marwil_torch_policy.py b/rllib/agents/marwil/marwil_torch_policy.py index 14ae943ecaf5..ef3558378794 100644 --- a/rllib/agents/marwil/marwil_torch_policy.py +++ b/rllib/agents/marwil/marwil_torch_policy.py @@ -4,7 +4,7 @@ from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_ops import apply_grad_clipping, explained_variance +from ray.rllib.utils.torch_ops import explained_variance torch, _ = try_import_torch() @@ -98,6 +98,5 @@ def setup_mixins(policy, obs_space, action_space, config): get_default_config=lambda: ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG, stats_fn=stats, postprocess_fn=postprocess_advantages, - extra_grad_process_fn=apply_grad_clipping, before_loss_init=setup_mixins, mixins=[ValueNetworkMixin]) diff --git a/rllib/agents/marwil/tests/test_marwil.py b/rllib/agents/marwil/tests/test_marwil.py index a0b3caa1079e..afb3ec9ee261 100644 --- a/rllib/agents/marwil/tests/test_marwil.py +++ b/rllib/agents/marwil/tests/test_marwil.py @@ -51,7 +51,7 @@ def test_marwil_compilation_and_learning_from_offline_file(self): min_reward = 70.0 # Test for all frameworks. 
- for _ in framework_iterator(config, frameworks=("tf", "torch")): + for _ in framework_iterator(config): trainer = marwil.MARWILTrainer(config=config, env="CartPole-v0") learnt = False for i in range(num_iterations): diff --git a/rllib/agents/mbmpo/mbmpo_torch_policy.py b/rllib/agents/mbmpo/mbmpo_torch_policy.py index 5dc03435c43b..06e65042e35f 100644 --- a/rllib/agents/mbmpo/mbmpo_torch_policy.py +++ b/rllib/agents/mbmpo/mbmpo_torch_policy.py @@ -1,5 +1,4 @@ import gym -from gym.spaces import Box, Discrete import logging from typing import Tuple, Type @@ -14,7 +13,6 @@ from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.policy.policy import Policy from ray.rllib.policy.policy_template import build_policy_class -from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import apply_grad_clipping from ray.rllib.utils.typing import TrainerConfigDict @@ -24,35 +22,6 @@ logger = logging.getLogger(__name__) -def validate_spaces(policy: Policy, observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: TrainerConfigDict) -> None: - """Validates the observation- and action spaces used for the Policy. - - Args: - policy (Policy): The policy, whose spaces are being validated. - observation_space (gym.spaces.Space): The observation space to - validate. - action_space (gym.spaces.Space): The action space to validate. - config (TrainerConfigDict): The Policy's config dict. - - Raises: - UnsupportedSpaceException: If one of the spaces is not supported. - """ - # Only support single Box or single Discrete spaces. - if not isinstance(action_space, (Box, Discrete)): - raise UnsupportedSpaceException( - "Action space ({}) of {} is not supported for " - "MB-MPO. Must be [Box|Discrete].".format(action_space, policy)) - # If Box, make sure it's a 1D vector space. 
- elif isinstance(action_space, Box) and len(action_space.shape) > 1: - raise UnsupportedSpaceException( - "Action space ({}) of {} has multiple dimensions " - "{}. ".format(action_space, policy, action_space.shape) + - "Consider reshaping this into a single dimension Box space " - "or using the multi-agent API.") - - def make_model_and_action_dist( policy: Policy, obs_space: gym.spaces.Space, diff --git a/rllib/agents/mbmpo/model_ensemble.py b/rllib/agents/mbmpo/model_ensemble.py index 1d0f13b719cb..2bb9513dabfb 100644 --- a/rllib/agents/mbmpo/model_ensemble.py +++ b/rllib/agents/mbmpo/model_ensemble.py @@ -136,8 +136,6 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, obs_space.low[0], obs_space.high[0], shape=(obs_space.shape[0] + action_space.shape[0], )) - else: - raise NotImplementedError super(DynamicsEnsembleCustomModel, self).__init__( input_space, action_space, num_outputs, model_config, name) @@ -200,9 +198,6 @@ def loss(self, x, y): def fit(self): # Add env samples to Replay Buffer local_worker = get_global_worker() - for pid, pol in local_worker.policy_map.items(): - pol.view_requirements[ - SampleBatch.NEXT_OBS].used_for_training = True new_samples = local_worker.sample() # Initial Exploration of 8000 timesteps if not self.global_itr: diff --git a/rllib/agents/mock.py b/rllib/agents/mock.py index 1a9017252567..90bfffe83bd8 100644 --- a/rllib/agents/mock.py +++ b/rllib/agents/mock.py @@ -118,14 +118,14 @@ def step(self): info={}) -def _trainer_import_failed(trace): +def _agent_import_failed(trace): """Returns dummy agent class for if PyTorch etc. 
is not installed.""" - class _TrainerImportFailed(Trainer): - _name = "TrainerImportFailed" + class _AgentImportFailed(Trainer): + _name = "AgentImportFailed" _default_config = with_common_config({}) def setup(self, config): raise ImportError(trace) - return _TrainerImportFailed + return _AgentImportFailed diff --git a/rllib/agents/ppo/ppo_tf_policy.py b/rllib/agents/ppo/ppo_tf_policy.py index 5991da84e328..57874ba296b3 100644 --- a/rllib/agents/ppo/ppo_tf_policy.py +++ b/rllib/agents/ppo/ppo_tf_policy.py @@ -182,15 +182,9 @@ def compute_and_clip_gradients(policy: Policy, optimizer: LocalOptimizer, # Clip by global norm, if necessary. if policy.config["grad_clip"] is not None: - # Defuse inf gradients (due to super large losses). grads = [g for (g, v) in grads_and_vars] - grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) - # If the global_norm is inf -> All grads will be NaN. Stabilize this - # here by setting them to 0.0. This will simply ignore destructive loss - # calculations. 
- policy.grads = [ - tf.where(tf.math.is_nan(g), tf.zeros_like(g), g) for g in grads - ] + policy.grads, _ = tf.clip_by_global_norm(grads, + policy.config["grad_clip"]) clipped_grads_and_vars = list(zip(policy.grads, variables)) return clipped_grads_and_vars else: diff --git a/rllib/agents/registry.py b/rllib/agents/registry.py index efed5a21742f..8ec4a4582ede 100644 --- a/rllib/agents/registry.py +++ b/rllib/agents/registry.py @@ -3,127 +3,126 @@ import traceback from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS -from ray.rllib.utils.deprecation import deprecation_warning def _import_a2c(): from ray.rllib.agents import a3c - return a3c.A2CTrainer, a3c.a2c.A2C_DEFAULT_CONFIG + return a3c.A2CTrainer def _import_a3c(): from ray.rllib.agents import a3c - return a3c.A3CTrainer, a3c.DEFAULT_CONFIG + return a3c.A3CTrainer def _import_apex(): from ray.rllib.agents import dqn - return dqn.ApexTrainer, dqn.apex.APEX_DEFAULT_CONFIG + return dqn.ApexTrainer def _import_apex_ddpg(): from ray.rllib.agents import ddpg - return ddpg.ApexDDPGTrainer, ddpg.apex.APEX_DDPG_DEFAULT_CONFIG + return ddpg.ApexDDPGTrainer def _import_appo(): from ray.rllib.agents import ppo - return ppo.APPOTrainer, ppo.appo.DEFAULT_CONFIG + return ppo.APPOTrainer def _import_ars(): from ray.rllib.agents import ars - return ars.ARSTrainer, ars.DEFAULT_CONFIG + return ars.ARSTrainer def _import_bc(): from ray.rllib.agents import marwil - return marwil.BCTrainer, marwil.DEFAULT_CONFIG + return marwil.BCTrainer def _import_cql(): from ray.rllib.agents import cql - return cql.CQLTrainer, cql.CQL_DEFAULT_CONFIG + return cql.CQLTrainer def _import_ddpg(): from ray.rllib.agents import ddpg - return ddpg.DDPGTrainer, ddpg.DEFAULT_CONFIG + return ddpg.DDPGTrainer def _import_ddppo(): from ray.rllib.agents import ppo - return ppo.DDPPOTrainer, ppo.DEFAULT_CONFIG + return ppo.DDPPOTrainer def _import_dqn(): from ray.rllib.agents import dqn - return dqn.DQNTrainer, dqn.DEFAULT_CONFIG + return 
dqn.DQNTrainer def _import_dreamer(): from ray.rllib.agents import dreamer - return dreamer.DREAMERTrainer, dreamer.DEFAULT_CONFIG + return dreamer.DREAMERTrainer def _import_es(): from ray.rllib.agents import es - return es.ESTrainer, es.DEFAULT_CONFIG + return es.ESTrainer def _import_impala(): from ray.rllib.agents import impala - return impala.ImpalaTrainer, impala.DEFAULT_CONFIG + return impala.ImpalaTrainer def _import_maml(): from ray.rllib.agents import maml - return maml.MAMLTrainer, maml.DEFAULT_CONFIG + return maml.MAMLTrainer def _import_marwil(): from ray.rllib.agents import marwil - return marwil.MARWILTrainer, marwil.DEFAULT_CONFIG + return marwil.MARWILTrainer def _import_mbmpo(): from ray.rllib.agents import mbmpo - return mbmpo.MBMPOTrainer, mbmpo.DEFAULT_CONFIG + return mbmpo.MBMPOTrainer def _import_pg(): from ray.rllib.agents import pg - return pg.PGTrainer, pg.DEFAULT_CONFIG + return pg.PGTrainer def _import_ppo(): from ray.rllib.agents import ppo - return ppo.PPOTrainer, ppo.DEFAULT_CONFIG + return ppo.PPOTrainer def _import_qmix(): from ray.rllib.agents import qmix - return qmix.QMixTrainer, qmix.DEFAULT_CONFIG + return qmix.QMixTrainer def _import_sac(): from ray.rllib.agents import sac - return sac.SACTrainer, sac.DEFAULT_CONFIG + return sac.SACTrainer def _import_simple_q(): from ray.rllib.agents import dqn - return dqn.SimpleQTrainer, dqn.simple_q.DEFAULT_CONFIG + return dqn.SimpleQTrainer def _import_slate_q(): from ray.rllib.agents import slateq - return slateq.SlateQTrainer, slateq.DEFAULT_CONFIG + return slateq.SlateQTrainer def _import_td3(): from ray.rllib.agents import ddpg - return ddpg.TD3Trainer, ddpg.td3.TD3_DEFAULT_CONFIG + return ddpg.TD3Trainer ALGORITHMS = { @@ -154,47 +153,32 @@ def _import_td3(): } -def get_trainer_class(alg: str, return_config=False) -> type: - """Returns the class of a known Trainer given its name.""" +def get_agent_class(alg: str) -> type: + """Returns the class of a known agent given its name.""" 
try: - return _get_trainer_class(alg, return_config=return_config) + return _get_agent_class(alg) except ImportError: - from ray.rllib.agents.mock import _trainer_import_failed - class_ = _trainer_import_failed(traceback.format_exc()) - config = class_._default_config - if return_config: - return class_, config - return class_ - + from ray.rllib.agents.mock import _agent_import_failed + return _agent_import_failed(traceback.format_exc()) -# Deprecated: Use `get_trainer_class` instead. -def get_agent_class(alg: str) -> type: - deprecation_warning("get_agent_class", "get_trainer_class", error=False) - return get_trainer_class(alg) - -def _get_trainer_class(alg: str, return_config=False) -> type: +def _get_agent_class(alg: str) -> type: if alg in ALGORITHMS: - class_, config = ALGORITHMS[alg]() + return ALGORITHMS[alg]() elif alg in CONTRIBUTED_ALGORITHMS: - class_, config = CONTRIBUTED_ALGORITHMS[alg]() + return CONTRIBUTED_ALGORITHMS[alg]() elif alg == "script": from ray.tune import script_runner - class_, config = script_runner.ScriptRunner, {} + return script_runner.ScriptRunner elif alg == "__fake": from ray.rllib.agents.mock import _MockTrainer - class_, config = _MockTrainer, _MockTrainer._default_config + return _MockTrainer elif alg == "__sigmoid_fake_data": from ray.rllib.agents.mock import _SigmoidFakeData - class_, config = _SigmoidFakeData, _SigmoidFakeData._default_config + return _SigmoidFakeData elif alg == "__parameter_tuning": from ray.rllib.agents.mock import _ParameterTuningTrainer - class_, config = _ParameterTuningTrainer, \ - _ParameterTuningTrainer._default_config + return _ParameterTuningTrainer else: raise Exception(("Unknown algorithm {}.").format(alg)) - - if return_config: - return class_, config - return class_ diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py index 97d0f7d77147..5c476248c737 100644 --- a/rllib/agents/sac/sac.py +++ b/rllib/agents/sac/sac.py @@ -16,7 +16,6 @@ from ray.rllib.agents.dqn.dqn import 
GenericOffPolicyTrainer from ray.rllib.agents.sac.sac_tf_policy import SACTFPolicy from ray.rllib.policy.policy import Policy -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.typing import TrainerConfigDict logger = logging.getLogger(__name__) @@ -40,37 +39,16 @@ # Use a e.g. conv2D state preprocessing network before concatenating the # resulting (feature) vector with the action input for the input to # the Q-networks. - "use_state_preprocessor": DEPRECATED_VALUE, - # Model options for the Q network(s). These will override MODEL_DEFAULTS. - # The `Q_model` dict is treated just as the top-level `model` dict in - # setting up the Q-network(s) (2 if twin_q=True). - # That means, you can do for different observation spaces: - # obs=Box(1D) -> Tuple(Box(1D) + Action) -> concat -> post_fcnet - # obs=Box(3D) -> Tuple(Box(3D) + Action) -> vision-net -> concat w/ action - # -> post_fcnet - # obs=Tuple(Box(1D), Box(3D)) -> Tuple(Box(1D), Box(3D), Action) - # -> vision-net -> concat w/ Box(1D) and action -> post_fcnet - # You can also have SAC use your custom_model as Q-model(s), by simply - # specifying the `custom_model` sub-key in below dict (just like you would - # do in the top-level `model` dict. + "use_state_preprocessor": False, + # Model options for the Q network(s). "Q_model": { - "fcnet_hiddens": [256, 256], "fcnet_activation": "relu", - "post_fcnet_hiddens": [], - "post_fcnet_activation": None, - "custom_model": None, # Use this to define custom Q-model(s). - "custom_model_config": {}, + "fcnet_hiddens": [256, 256], }, - # Model options for the policy function (see `Q_model` above for details). - # The difference to `Q_model` above is that no action concat'ing is - # performed before the post_fcnet stack. + # Model options for the policy function. 
"policy_model": { - "fcnet_hiddens": [256, 256], "fcnet_activation": "relu", - "post_fcnet_hiddens": [], - "post_fcnet_activation": None, - "custom_model": None, # Use this to define a custom policy model. - "custom_model_config": {}, + "fcnet_hiddens": [256, 256], }, # Unsquash actions to the upper and lower bounds of env's action space. # Ignored for discrete action spaces. @@ -167,10 +145,11 @@ def validate_config(config: TrainerConfigDict) -> None: Raises: ValueError: In case something is wrong with the config. """ - if config["use_state_preprocessor"] != DEPRECATED_VALUE: - deprecation_warning( - old="config['use_state_preprocessor']", error=False) - config["use_state_preprocessor"] = DEPRECATED_VALUE + if config["model"].get("custom_model"): + logger.warning( + "Setting use_state_preprocessor=True since a custom model " + "was specified.") + config["use_state_preprocessor"] = True if config["grad_clip"] is not None and config["grad_clip"] <= 0.0: raise ValueError("`grad_clip` value must be > 0.0!") diff --git a/rllib/agents/sac/sac_tf_model.py b/rllib/agents/sac/sac_tf_model.py index b457f1e947e0..4c890385f58f 100644 --- a/rllib/agents/sac/sac_tf_model.py +++ b/rllib/agents/sac/sac_tf_model.py @@ -1,12 +1,9 @@ import gym from gym.spaces import Box, Discrete import numpy as np -from typing import Dict, List, Optional +from typing import Optional, Tuple -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils import force_list -from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.typing import ModelConfigDict, TensorType @@ -17,21 +14,14 @@ class SACTFModel(TFModelV2): """Extension of the standard TFModelV2 for SAC. - To customize, do one of the following: - - sub-class SACTFModel and override one or more of its methods. 
- - Use SAC's `Q_model` and `policy_model` keys to tweak the default model - behaviors (e.g. fcnet_hiddens, conv_filters, etc..). - - Use SAC's `Q_model->custom_model` and `policy_model->custom_model` keys - to specify your own custom Q-model(s) and policy-models, which will be - created within this SACTFModel (see `build_policy_model` and - `build_q_model`. - - Note: It is not recommended to override the `forward` method for SAC. This - would lead to shared weights (between policy and Q-nets), which will then - not be optimized by either of the critic- or actor-optimizers! + Instances of this Model get created via wrapping this class around another + default- or custom model (inside + rllib/agents/sac/sac_tf_policy.py::build_sac_model). Doing so simply adds + this class' methods (`get_q_values`, etc..) to the wrapped model, such that + the wrapped model can be used by the SAC algorithm. Data flow: - `obs` -> forward() (should stay a noop method!) -> `model_out` + `obs` -> forward() -> `model_out` `model_out` -> get_policy_output() -> pi(actions|obs) `model_out`, `actions` -> get_q_values() -> Q(s, a) `model_out`, `actions` -> get_twin_q_values() -> Q_twin(s, a) @@ -43,18 +33,20 @@ def __init__(self, num_outputs: Optional[int], model_config: ModelConfigDict, name: str, - policy_model_config: ModelConfigDict = None, - q_model_config: ModelConfigDict = None, + actor_hidden_activation: str = "relu", + actor_hiddens: Tuple[int] = (256, 256), + critic_hidden_activation: str = "relu", + critic_hiddens: Tuple[int] = (256, 256), twin_q: bool = False, initial_alpha: float = 1.0, target_entropy: Optional[float] = None): """Initialize a SACTFModel instance. Args: - policy_model_config (ModelConfigDict): The config dict for the - policy network. - q_model_config (ModelConfigDict): The config dict for the - Q-network(s) (2 if twin_q=True). + actor_hidden_activation (str): Activation for the actor network. + actor_hiddens (list): Hidden layers sizes for the actor network. 
+ critic_hidden_activation (str): Activation for the critic network. + critic_hiddens (list): Hidden layers sizes for the critic network. twin_q (bool): Build twin Q networks (Q-net and target) for more stable Q-learning. initial_alpha (float): The initial value for the to-be-optimized @@ -85,15 +77,54 @@ def __init__(self, action_outs = self.action_dim q_outs = 1 - self.action_model = self.build_policy_model( - self.obs_space, action_outs, policy_model_config, "policy_model") + self.model_out = tf.keras.layers.Input( + shape=(self.num_outputs, ), name="model_out") + self.action_model = tf.keras.Sequential([ + tf.keras.layers.Dense( + units=hidden, + activation=getattr(tf.nn, actor_hidden_activation, None), + name="action_{}".format(i + 1)) + for i, hidden in enumerate(actor_hiddens) + ] + [ + tf.keras.layers.Dense( + units=action_outs, activation=None, name="action_out") + ]) + self.shift_and_log_scale_diag = self.action_model(self.model_out) + + self.actions_input = None + if not self.discrete: + self.actions_input = tf.keras.layers.Input( + shape=(self.action_dim, ), name="actions") + + def build_q_net(name, observations, actions): + # For continuous actions: Feed obs and actions (concatenated) + # through the NN. For discrete actions, only obs. 
+ q_net = tf.keras.Sequential(([ + tf.keras.layers.Concatenate(axis=1), + ] if not self.discrete else []) + [ + tf.keras.layers.Dense( + units=units, + activation=getattr(tf.nn, critic_hidden_activation, None), + name="{}_hidden_{}".format(name, i)) + for i, units in enumerate(critic_hiddens) + ] + [ + tf.keras.layers.Dense( + units=q_outs, activation=None, name="{}_out".format(name)) + ]) + + # TODO(hartikainen): Remove the unnecessary Model calls here + if self.discrete: + q_net = tf.keras.Model(observations, q_net(observations)) + else: + q_net = tf.keras.Model([observations, actions], + q_net([observations, actions])) + return q_net + + self.q_net = build_q_net("q", self.model_out, self.actions_input) - self.q_net = self.build_q_model(self.obs_space, self.action_space, - q_outs, q_model_config, "q") if twin_q: - self.twin_q_net = self.build_q_model(self.obs_space, - self.action_space, q_outs, - q_model_config, "twin_q") + self.twin_q_net = build_q_net("twin_q", self.model_out, + self.actions_input) else: self.twin_q_net = None @@ -112,80 +143,6 @@ def __init__(self, target_entropy = -np.prod(action_space.shape) self.target_entropy = target_entropy - @override(TFModelV2) - def forward(self, input_dict: Dict[str, TensorType], - state: List[TensorType], - seq_lens: TensorType) -> (TensorType, List[TensorType]): - """The common (Q-net and policy-net) forward pass. - - NOTE: It is not(!) recommended to override this method as it would - introduce a shared pre-network, which would be updated by both - actor- and critic optimizers. - """ - return input_dict["obs"], state - - def build_policy_model(self, obs_space, num_outputs, policy_model_config, - name): - """Builds the policy model used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own policy net. 
Alternatively, simply set `custom_model` within the - top level SAC `policy_model` config key to make this default - implementation of `build_policy_model` use your custom policy network. - - Returns: - TFModelV2: The TFModelV2 policy sub-model. - """ - model = ModelCatalog.get_model_v2( - obs_space, - self.action_space, - num_outputs, - policy_model_config, - framework="tf", - name=name) - return model - - def build_q_model(self, obs_space, action_space, num_outputs, - q_model_config, name): - """Builds one of the (twin) Q-nets used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own Q-nets. Alternatively, simply set `custom_model` within the - top level SAC `Q_model` config key to make this default implementation - of `build_q_model` use your custom Q-nets. - - Returns: - TFModelV2: The TFModelV2 Q-net sub-model. - """ - self.concat_obs_and_actions = False - if self.discrete: - input_space = obs_space - else: - orig_space = getattr(obs_space, "original_space", obs_space) - if isinstance(orig_space, Box) and len(orig_space.shape) == 1: - input_space = Box( - float("-inf"), - float("inf"), - shape=(orig_space.shape[0] + action_space.shape[0], )) - self.concat_obs_and_actions = True - else: - if isinstance(orig_space, gym.spaces.Tuple): - spaces = orig_space.spaces - elif isinstance(orig_space, gym.spaces.Dict): - spaces = list(orig_space.spaces.values()) - else: - spaces = [obs_space] - input_space = gym.spaces.Tuple(spaces + [action_space]) - - model = ModelCatalog.get_model_v2( - input_space, - action_space, - num_outputs, - q_model_config, - framework="tf", - name=name) - return model - def get_q_values(self, model_out: TensorType, actions: Optional[TensorType] = None) -> TensorType: @@ -204,7 +161,12 @@ def get_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.q_net) + # Continuous case -> concat actions to model_out. 
+ if actions is not None: + return self.q_net([model_out, actions]) + # Discrete case -> return q-vals for all actions. + else: + return self.q_net(model_out) def get_twin_q_values(self, model_out: TensorType, @@ -223,34 +185,12 @@ def get_twin_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.twin_q_net) - - def _get_q_value(self, model_out, actions, net): - # Model outs may come as original Tuple/Dict observations, concat them - # here if this is the case. - if isinstance(net.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = tf.concat(model_out, axis=-1) - elif isinstance(model_out, dict): - model_out = tf.concat(list(model_out.values()), axis=-1) - elif isinstance(model_out, dict): - model_out = list(model_out.values()) - # Continuous case -> concat actions to model_out. if actions is not None: - if self.concat_obs_and_actions: - input_dict = {"obs": tf.concat([model_out, actions], axis=-1)} - else: - input_dict = {"obs": force_list(model_out) + [actions]} + return self.twin_q_net([model_out, actions]) # Discrete case -> return q-vals for all actions. else: - input_dict = {"obs": model_out} - # Switch on training mode (when getting Q-values, we are usually in - # training). - input_dict["is_training"] = True - - out, _ = net(input_dict, [], None) - return out + return self.twin_q_net(model_out) def get_policy_output(self, model_out: TensorType) -> TensorType: """Returns policy outputs, given the output of self.__call__(). @@ -267,23 +207,15 @@ def get_policy_output(self, model_out: TensorType) -> TensorType: Returns: TensorType: Distribution inputs for sampling actions. """ - # Model outs may come as original Tuple observations, concat them - # here if this is the case. 
- if isinstance(self.action_model.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = tf.concat(model_out, axis=-1) - elif isinstance(model_out, dict): - model_out = tf.concat(list(model_out.values()), axis=-1) - out, _ = self.action_model({"obs": model_out}, [], None) - return out + return self.action_model(model_out) def policy_variables(self): """Return the list of variables for the policy net.""" - return self.action_model.variables() + return list(self.action_model.variables) def q_variables(self): """Return the list of variables for Q / twin Q nets.""" - return self.q_net.variables() + (self.twin_q_net.variables() - if self.twin_q_net else []) + return self.q_net.variables + (self.twin_q_net.variables + if self.twin_q_net else []) diff --git a/rllib/agents/sac/sac_tf_policy.py b/rllib/agents/sac/sac_tf_policy.py index e4cc080afc66..44ddbff1fd84 100644 --- a/rllib/agents/sac/sac_tf_policy.py +++ b/rllib/agents/sac/sac_tf_policy.py @@ -6,7 +6,6 @@ from gym.spaces import Box, Discrete from functools import partial import logging -import numpy as np from typing import Dict, List, Optional, Tuple, Type, Union import ray @@ -18,7 +17,7 @@ from ray.rllib.agents.sac.sac_tf_model import SACTFModel from ray.rllib.agents.sac.sac_torch_model import SACTorchModel from ray.rllib.evaluation.episode import MultiAgentEpisode -from ray.rllib.models import ModelCatalog, MODEL_DEFAULTS +from ray.rllib.models import ModelCatalog from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_action_dist import Beta, Categorical, \ DiagGaussian, Dirichlet, SquashedGaussian, TFActionDistribution @@ -56,35 +55,40 @@ def build_sac_model(policy: Policy, obs_space: gym.spaces.Space, `policy.target_model`. """ # With separate state-preprocessor (before obs+action concat). - num_outputs = int(np.product(obs_space.shape)) + if config["use_state_preprocessor"]: + num_outputs = 256 # Flatten last Conv2D to this many nodes. 
+ # No separate state-preprocessor: concat obs+actions right away. + else: + num_outputs = 0 + # No state preprocessor: fcnet_hiddens should be empty. + if config["model"]["fcnet_hiddens"]: + logger.warning( + "When not using a state-preprocessor with SAC, `fcnet_hiddens`" + " will be set to an empty list! Any hidden layer sizes are " + "defined via `policy_model.fcnet_hiddens` and " + "`Q_model.fcnet_hiddens`.") + config["model"]["fcnet_hiddens"] = [] # Force-ignore any additionally provided hidden layer sizes. # Everything should be configured using SAC's "Q_model" and "policy_model" # settings. - policy_model_config = MODEL_DEFAULTS.copy() - policy_model_config.update(config["policy_model"]) - q_model_config = MODEL_DEFAULTS.copy() - q_model_config.update(config["Q_model"]) - - default_model_cls = SACTorchModel if config["framework"] == "torch" \ - else SACTFModel - model = ModelCatalog.get_model_v2( obs_space=obs_space, action_space=action_space, num_outputs=num_outputs, model_config=config["model"], framework=config["framework"], - default_model=default_model_cls, + model_interface=SACTorchModel + if config["framework"] == "torch" else SACTFModel, name="sac_model", - policy_model_config=policy_model_config, - q_model_config=q_model_config, + actor_hidden_activation=config["policy_model"]["fcnet_activation"], + actor_hiddens=config["policy_model"]["fcnet_hiddens"], + critic_hidden_activation=config["Q_model"]["fcnet_activation"], + critic_hiddens=config["Q_model"]["fcnet_hiddens"], twin_q=config["twin_q"], initial_alpha=config["initial_alpha"], target_entropy=config["target_entropy"]) - assert isinstance(model, default_model_cls) - # Create an exact copy of the model and store it in `policy.target_model`. 
# This will be used for tau-synched Q-target models that run behind the # actual Q-networks and are used for target q-value calculations in the @@ -95,16 +99,17 @@ def build_sac_model(policy: Policy, obs_space: gym.spaces.Space, num_outputs=num_outputs, model_config=config["model"], framework=config["framework"], - default_model=default_model_cls, + model_interface=SACTorchModel + if config["framework"] == "torch" else SACTFModel, name="target_sac_model", - policy_model_config=policy_model_config, - q_model_config=q_model_config, + actor_hidden_activation=config["policy_model"]["fcnet_activation"], + actor_hiddens=config["policy_model"]["fcnet_hiddens"], + critic_hidden_activation=config["Q_model"]["fcnet_activation"], + critic_hiddens=config["Q_model"]["fcnet_hiddens"], twin_q=config["twin_q"], initial_alpha=config["initial_alpha"], target_entropy=config["target_entropy"]) - assert isinstance(policy.target_model, default_model_cls) - return model @@ -193,14 +198,14 @@ def get_distribution_inputs_and_class( dist inputs, dist class, and a list of internal state outputs (in the RNN case). """ - # Get base-model (forward) output (this should be a noop call). - forward_out, state_out = model({ + # Get base-model output (w/o the SAC specific parts of the network). + model_out, state_out = model({ "obs": obs_batch, "is_training": policy._get_is_training_placeholder(), }, [], None) # Use the base output to get the policy outputs from the SAC model's # policy components. - distribution_inputs = model.get_policy_output(forward_out) + distribution_inputs = model.get_policy_output(model_out) # Get a distribution class to be used with the just calculated dist-inputs. action_dist_class = _get_dist_class(policy.config, policy.action_space) @@ -652,7 +657,7 @@ def validate_spaces(policy: Policy, observation_space: gym.spaces.Space, Raises: UnsupportedSpaceException: If one of the spaces is not supported. """ - # Only support single Box or single Discrete spaces. 
+ # Only support single Box or single Discrete spaces. if not isinstance(action_space, (Box, Discrete, Simplex)): raise UnsupportedSpaceException( "Action space ({}) of {} is not supported for " diff --git a/rllib/agents/sac/sac_torch_model.py b/rllib/agents/sac/sac_torch_model.py index 1288d20da362..5f8b05980fed 100644 --- a/rllib/agents/sac/sac_torch_model.py +++ b/rllib/agents/sac/sac_torch_model.py @@ -1,12 +1,11 @@ import gym from gym.spaces import Box, Discrete import numpy as np -from typing import Dict, List, Optional +from typing import Optional, Tuple -from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils import force_list -from ray.rllib.utils.annotations import override +from ray.rllib.models.utils import get_activation_fn from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.typing import ModelConfigDict, TensorType @@ -17,21 +16,14 @@ class SACTorchModel(TorchModelV2, nn.Module): """Extension of the standard TorchModelV2 for SAC. - To customize, do one of the following: - - sub-class SACTorchModel and override one or more of its methods. - - Use SAC's `Q_model` and `policy_model` keys to tweak the default model - behaviors (e.g. fcnet_hiddens, conv_filters, etc..). - - Use SAC's `Q_model->custom_model` and `policy_model->custom_model` keys - to specify your own custom Q-model(s) and policy-models, which will be - created within this SACTFModel (see `build_policy_model` and - `build_q_model`. - - Note: It is not recommended to override the `forward` method for SAC. This - would lead to shared weights (between policy and Q-nets), which will then - not be optimized by either of the critic- or actor-optimizers!
+ Instances of this Model get created via wrapping this class around another + default- or custom model (inside + rllib/agents/sac/sac_torch_policy.py::build_sac_model). Doing so simply + adds this class' methods (`get_q_values`, etc..) to the wrapped model, such + that the wrapped model can be used by the SAC algorithm. Data flow: - `obs` -> forward() (should stay a noop method!) -> `model_out` + `obs` -> forward() -> `model_out` `model_out` -> get_policy_output() -> pi(actions|obs) `model_out`, `actions` -> get_q_values() -> Q(s, a) `model_out`, `actions` -> get_twin_q_values() -> Q_twin(s, a) @@ -43,18 +35,20 @@ def __init__(self, num_outputs: Optional[int], model_config: ModelConfigDict, name: str, - policy_model_config: ModelConfigDict = None, - q_model_config: ModelConfigDict = None, + actor_hidden_activation: str = "relu", + actor_hiddens: Tuple[int] = (256, 256), + critic_hidden_activation: str = "relu", + critic_hiddens: Tuple[int] = (256, 256), twin_q: bool = False, initial_alpha: float = 1.0, target_entropy: Optional[float] = None): """Initializes a SACTorchModel instance. 7 Args: - policy_model_config (ModelConfigDict): The config dict for the - policy network. - q_model_config (ModelConfigDict): The config dict for the - Q-network(s) (2 if twin_q=True). + actor_hidden_activation (str): Activation for the actor network. + actor_hiddens (list): Hidden layers sizes for the actor network. + critic_hidden_activation (str): Activation for the critic network. + critic_hiddens (list): Hidden layers sizes for the critic network. twin_q (bool): Build twin Q networks (Q-net and target) for more stable Q-learning. initial_alpha (float): The initial value for the to-be-optimized @@ -75,29 +69,74 @@ def __init__(self, self.action_dim = action_space.n self.discrete = True action_outs = q_outs = self.action_dim + action_ins = None # No action inputs for the discrete case. 
elif isinstance(action_space, Box): self.action_dim = np.product(action_space.shape) self.discrete = False action_outs = 2 * self.action_dim + action_ins = self.action_dim q_outs = 1 else: assert isinstance(action_space, Simplex) self.action_dim = np.product(action_space.shape) self.discrete = False action_outs = self.action_dim + action_ins = self.action_dim q_outs = 1 # Build the policy network. - self.action_model = self.build_policy_model( - self.obs_space, action_outs, policy_model_config, "policy_model") - - # Build the Q-network(s). - self.q_net = self.build_q_model(self.obs_space, self.action_space, - q_outs, q_model_config, "q") + self.action_model = nn.Sequential() + ins = self.num_outputs + self.obs_ins = ins + activation = get_activation_fn( + actor_hidden_activation, framework="torch") + for i, n in enumerate(actor_hiddens): + self.action_model.add_module( + "action_{}".format(i), + SlimFC( + ins, + n, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=activation)) + ins = n + self.action_model.add_module( + "action_out", + SlimFC( + ins, + action_outs, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=None)) + + # Build the Q-net(s), including target Q-net(s). + def build_q_net(name_): + activation = get_activation_fn( + critic_hidden_activation, framework="torch") + # For continuous actions: Feed obs and actions (concatenated) + # through the NN. For discrete actions, only obs. 
+ q_net = nn.Sequential() + ins = self.obs_ins + (0 if self.discrete else action_ins) + for i, n in enumerate(critic_hiddens): + q_net.add_module( + "{}_hidden_{}".format(name_, i), + SlimFC( + ins, + n, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=activation)) + ins = n + + q_net.add_module( + "{}_out".format(name_), + SlimFC( + ins, + q_outs, + initializer=torch.nn.init.xavier_uniform_, + activation_fn=None)) + return q_net + + self.q_net = build_q_net("q") if twin_q: - self.twin_q_net = self.build_q_model(self.obs_space, - self.action_space, q_outs, - q_model_config, "twin_q") + self.twin_q_net = build_q_net("twin_q") else: self.twin_q_net = None @@ -118,80 +157,6 @@ def __init__(self, self.target_entropy = torch.tensor( data=[target_entropy], dtype=torch.float32, requires_grad=False) - @override(TorchModelV2) - def forward(self, input_dict: Dict[str, TensorType], - state: List[TensorType], - seq_lens: TensorType) -> (TensorType, List[TensorType]): - """The common (Q-net and policy-net) forward pass. - - NOTE: It is not(!) recommended to override this method as it would - introduce a shared pre-network, which would be updated by both - actor- and critic optimizers. - """ - return input_dict["obs"], state - - def build_policy_model(self, obs_space, num_outputs, policy_model_config, - name): - """Builds the policy model used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own policy net. Alternatively, simply set `custom_model` within the - top level SAC `policy_model` config key to make this default - implementation of `build_policy_model` use your custom policy network. - - Returns: - TorchModelV2: The TorchModelV2 policy sub-model. 
- """ - model = ModelCatalog.get_model_v2( - obs_space, - self.action_space, - num_outputs, - policy_model_config, - framework="torch", - name=name) - return model - - def build_q_model(self, obs_space, action_space, num_outputs, - q_model_config, name): - """Builds one of the (twin) Q-nets used by this SAC. - - Override this method in a sub-class of SACTFModel to implement your - own Q-nets. Alternatively, simply set `custom_model` within the - top level SAC `Q_model` config key to make this default implementation - of `build_q_model` use your custom Q-nets. - - Returns: - TorchModelV2: The TorchModelV2 Q-net sub-model. - """ - self.concat_obs_and_actions = False - if self.discrete: - input_space = obs_space - else: - orig_space = getattr(obs_space, "original_space", obs_space) - if isinstance(orig_space, Box) and len(orig_space.shape) == 1: - input_space = Box( - float("-inf"), - float("inf"), - shape=(orig_space.shape[0] + action_space.shape[0], )) - self.concat_obs_and_actions = True - else: - if isinstance(orig_space, gym.spaces.Tuple): - spaces = orig_space.spaces - elif isinstance(orig_space, gym.spaces.Dict): - spaces = list(orig_space.spaces.values()) - else: - spaces = [obs_space] - input_space = gym.spaces.Tuple(spaces + [action_space]) - - model = ModelCatalog.get_model_v2( - input_space, - action_space, - num_outputs, - q_model_config, - framework="torch", - name=name) - return model - def get_q_values(self, model_out: TensorType, actions: Optional[TensorType] = None) -> TensorType: @@ -210,7 +175,12 @@ def get_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.q_net) + # Continuous case -> concat actions to model_out. + if actions is not None: + return self.q_net(torch.cat([model_out, actions], -1)) + # Discrete case -> return q-vals for all actions. 
+ else: + return self.q_net(model_out) def get_twin_q_values(self, model_out: TensorType, @@ -229,34 +199,12 @@ def get_twin_q_values(self, Returns: TensorType: Q-values tensor of shape [BATCH_SIZE, 1]. """ - return self._get_q_value(model_out, actions, self.twin_q_net) - - def _get_q_value(self, model_out, actions, net): - # Model outs may come as original Tuple observations, concat them - # here if this is the case. - if isinstance(net.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = torch.cat(model_out, dim=-1) - elif isinstance(model_out, dict): - model_out = torch.cat(list(model_out.values()), dim=-1) - elif isinstance(model_out, dict): - model_out = list(model_out.values()) - # Continuous case -> concat actions to model_out. if actions is not None: - if self.concat_obs_and_actions: - input_dict = {"obs": torch.cat([model_out, actions], dim=-1)} - else: - input_dict = {"obs": force_list(model_out) + [actions]} + return self.twin_q_net(torch.cat([model_out, actions], -1)) # Discrete case -> return q-vals for all actions. else: - input_dict = {"obs": model_out} - # Switch on training mode (when getting Q-values, we are usually in - # training). - input_dict["is_training"] = True - - out, _ = net(input_dict, [], None) - return out + return self.twin_q_net(model_out) def get_policy_output(self, model_out: TensorType) -> TensorType: """Returns policy outputs, given the output of self.__call__(). @@ -273,23 +221,15 @@ def get_policy_output(self, model_out: TensorType) -> TensorType: Returns: TensorType: Distribution inputs for sampling actions. """ - # Model outs may come as original Tuple observations, concat them - # here if this is the case. 
- if isinstance(self.action_model.obs_space, Box): - if isinstance(model_out, (list, tuple)): - model_out = torch.cat(model_out, dim=-1) - elif isinstance(model_out, dict): - model_out = torch.cat(list(model_out.values()), dim=-1) - out, _ = self.action_model({"obs": model_out}, [], None) - return out + return self.action_model(model_out) def policy_variables(self): """Return the list of variables for the policy net.""" - return self.action_model.variables() + return list(self.action_model.parameters()) def q_variables(self): """Return the list of variables for Q / twin Q nets.""" - return self.q_net.variables() + (self.twin_q_net.variables() - if self.twin_q_net else []) + return list(self.q_net.parameters()) + \ + (list(self.twin_q_net.parameters()) if self.twin_q_net else []) diff --git a/rllib/agents/sac/sac_torch_policy.py b/rllib/agents/sac/sac_torch_policy.py index 60a206e91453..d000e183913c 100644 --- a/rllib/agents/sac/sac_torch_policy.py +++ b/rllib/agents/sac/sac_torch_policy.py @@ -32,29 +32,6 @@ logger = logging.getLogger(__name__) -def _get_dist_class(config: TrainerConfigDict, action_space: gym.spaces.Space - ) -> Type[TorchDistributionWrapper]: - """Helper function to return a dist class based on config and action space. - - Args: - config (TrainerConfigDict): The Trainer's config dict. - action_space (gym.spaces.Space): The action space used. - - Returns: - Type[TFActionDistribution]: A TF distribution class. 
- """ - if isinstance(action_space, Discrete): - return TorchCategorical - elif isinstance(action_space, Simplex): - return TorchDirichlet - else: - if config["normalize_actions"]: - return TorchSquashedGaussian if \ - not config["_use_beta_distribution"] else TorchBeta - else: - return TorchDiagGaussian - - def build_sac_model_and_action_dist( policy: Policy, obs_space: gym.spaces.Space, @@ -79,6 +56,29 @@ def build_sac_model_and_action_dist( return model, action_dist_class +def _get_dist_class(config: TrainerConfigDict, action_space: gym.spaces.Space + ) -> Type[TorchDistributionWrapper]: + """Helper function to return a dist class based on config and action space. + + Args: + config (TrainerConfigDict): The Trainer's config dict. + action_space (gym.spaces.Space): The action space used. + + Returns: + Type[TorchDistributionWrapper]: A Torch distribution class. + """ + if isinstance(action_space, Discrete): + return TorchCategorical + elif isinstance(action_space, Simplex): + return TorchDirichlet + else: + if config["normalize_actions"]: + return TorchSquashedGaussian if \ + not config["_use_beta_distribution"] else TorchBeta + else: + return TorchDiagGaussian + + def action_distribution_fn( policy: Policy, model: ModelV2, diff --git a/rllib/agents/sac/tests/test_sac.py b/rllib/agents/sac/tests/test_sac.py index b32beaac13fd..6a84b19c7478 100644 --- a/rllib/agents/sac/tests/test_sac.py +++ b/rllib/agents/sac/tests/test_sac.py @@ -1,5 +1,5 @@ from gym import Env -from gym.spaces import Box, Discrete, Tuple +from gym.spaces import Box import numpy as np import re import unittest @@ -9,10 +9,6 @@ from ray.rllib.agents.sac.sac_tf_policy import sac_actor_critic_loss as tf_loss from ray.rllib.agents.sac.sac_torch_policy import actor_critic_loss as \ loss_torch -from ray.rllib.examples.env.random_env import RandomEnv -from ray.rllib.examples.models.batch_norm_model import KerasBatchNormModel, \ - TorchBatchNormModel -from ray.rllib.models.catalog import ModelCatalog from
ray.rllib.models.tf.tf_action_dist import Dirichlet from ray.rllib.models.torch.torch_action_dist import TorchDirichlet from ray.rllib.execution.replay_buffer import LocalReplayBuffer @@ -56,7 +52,7 @@ def step(self, action): class TestSAC(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - ray.init(local_mode=True) + ray.init() @classmethod def tearDownClass(cls) -> None: @@ -65,46 +61,22 @@ def tearDownClass(cls) -> None: def test_sac_compilation(self): """Tests whether an SACTrainer can be built with all frameworks.""" config = sac.DEFAULT_CONFIG.copy() - config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy() config["num_workers"] = 0 # Run locally. config["twin_q"] = True + config["soft_horizon"] = True config["clip_actions"] = False config["normalize_actions"] = True config["learning_starts"] = 0 config["prioritized_replay"] = True - config["rollout_fragment_length"] = 10 - config["train_batch_size"] = 10 num_iterations = 1 - - ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel) - ModelCatalog.register_custom_model("batch_norm_torch", - TorchBatchNormModel) - - image_space = Box(-1.0, 1.0, shape=(84, 84, 3)) - simple_space = Box(-1.0, 1.0, shape=(3, )) - - for fw in framework_iterator(config): + for _ in framework_iterator(config): # Test for different env types (discrete w/ and w/o image, + cont). for env in [ - RandomEnv, - "MsPacmanNoFrameskip-v4", - "CartPole-v0", + "Pendulum-v0", "MsPacmanNoFrameskip-v4", "CartPole-v0" ]: print("Env={}".format(env)) - if env == RandomEnv: - config["env_config"] = { - "observation_space": Tuple( - [simple_space, - Discrete(2), image_space]), - "action_space": Box(-1.0, 1.0, shape=(1, )), - } - else: - config["env_config"] = {} - # Test making the Q-model a custom one for CartPole, otherwise, - # use the default model. 
- config["Q_model"]["custom_model"] = "batch_norm{}".format( - "_torch" - if fw == "torch" else "") if env == "CartPole-v0" else None + config["use_state_preprocessor"] = \ + env == "MsPacmanNoFrameskip-v4" trainer = sac.SACTrainer(config=config, env=env) for i in range(num_iterations): results = trainer.train() @@ -131,63 +103,63 @@ def test_sac_loss_function(self): config["env_config"] = {"simplex_actions": True} map_ = { - # Action net. - "default_policy/fc_1/kernel": "action_model._hidden_layers.0." + # Normal net. + "default_policy/sequential/action_1/kernel": "action_model." + "action_0._model.0.weight", + "default_policy/sequential/action_1/bias": "action_model." + "action_0._model.0.bias", + "default_policy/sequential/action_out/kernel": "action_model." + "action_out._model.0.weight", + "default_policy/sequential/action_out/bias": "action_model." + "action_out._model.0.bias", + "default_policy/sequential_1/q_hidden_0/kernel": "q_net." + "q_hidden_0._model.0.weight", + "default_policy/sequential_1/q_hidden_0/bias": "q_net." + "q_hidden_0._model.0.bias", + "default_policy/sequential_1/q_out/kernel": "q_net." + "q_out._model.0.weight", + "default_policy/sequential_1/q_out/bias": "q_net." + "q_out._model.0.bias", + "default_policy/value_out/kernel": "_value_branch." "_model.0.weight", - "default_policy/fc_1/bias": "action_model._hidden_layers.0." + "default_policy/value_out/bias": "_value_branch." "_model.0.bias", - "default_policy/fc_out/kernel": "action_model." - "_logits._model.0.weight", - "default_policy/fc_out/bias": "action_model._logits._model.0.bias", - "default_policy/value_out/kernel": "action_model." - "_value_branch._model.0.weight", - "default_policy/value_out/bias": "action_model." - "_value_branch._model.0.bias", - # Q-net. - "default_policy/fc_1_1/kernel": "q_net." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_1/bias": "q_net." 
- "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_1/kernel": "q_net._logits._model.0.weight", - "default_policy/fc_out_1/bias": "q_net._logits._model.0.bias", - "default_policy/value_out_1/kernel": "q_net." - "_value_branch._model.0.weight", - "default_policy/value_out_1/bias": "q_net." - "_value_branch._model.0.bias", "default_policy/log_alpha": "log_alpha", - # Target action-net. - "default_policy/fc_1_2/kernel": "action_model." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_2/bias": "action_model." - "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_2/kernel": "action_model." - "_logits._model.0.weight", - "default_policy/fc_out_2/bias": "action_model." - "_logits._model.0.bias", - "default_policy/value_out_2/kernel": "action_model." - "_value_branch._model.0.weight", - "default_policy/value_out_2/bias": "action_model." - "_value_branch._model.0.bias", - # Target Q-net - "default_policy/fc_1_3/kernel": "q_net." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_3/bias": "q_net." - "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_3/kernel": "q_net." - "_logits._model.0.weight", - "default_policy/fc_out_3/bias": "q_net." - "_logits._model.0.bias", - "default_policy/value_out_3/kernel": "q_net." - "_value_branch._model.0.weight", - "default_policy/value_out_3/bias": "q_net." - "_value_branch._model.0.bias", + # Target net. + "default_policy/sequential_2/action_1/kernel": "action_model." + "action_0._model.0.weight", + "default_policy/sequential_2/action_1/bias": "action_model." + "action_0._model.0.bias", + "default_policy/sequential_2/action_out/kernel": "action_model." + "action_out._model.0.weight", + "default_policy/sequential_2/action_out/bias": "action_model." + "action_out._model.0.bias", + "default_policy/sequential_3/q_hidden_0/kernel": "q_net." + "q_hidden_0._model.0.weight", + "default_policy/sequential_3/q_hidden_0/bias": "q_net." 
+ "q_hidden_0._model.0.bias", + "default_policy/sequential_3/q_out/kernel": "q_net." + "q_out._model.0.weight", + "default_policy/sequential_3/q_out/bias": "q_net." + "q_out._model.0.bias", + "default_policy/value_out_1/kernel": "_value_branch." + "_model.0.weight", + "default_policy/value_out_1/bias": "_value_branch." + "_model.0.bias", "default_policy/log_alpha_1": "log_alpha", } env = SimpleEnv batch_size = 100 - obs_size = (batch_size, 1) - actions = np.random.random(size=(batch_size, 2)) + if env is SimpleEnv: + obs_size = (batch_size, 1) + actions = np.random.random(size=(batch_size, 2)) + elif env == "CartPole-v0": + obs_size = (batch_size, 4) + actions = np.random.randint(0, 2, size=(batch_size, )) + else: + obs_size = (batch_size, 3) + actions = np.random.random(size=(batch_size, 1)) # Batch of size=n. input_ = self._get_batch_helper(obs_size, actions, batch_size) @@ -253,12 +225,10 @@ def test_sac_loss_function(self): policy.td_error, policy.optimizer().compute_gradients( policy.critic_loss[0], - [v for v in policy.model.q_variables() if - "value_" not in v.name]), + policy.model.q_variables()), policy.optimizer().compute_gradients( policy.actor_loss, - [v for v in policy.model.policy_variables() if - "value_" not in v.name]), + policy.model.policy_variables()), policy.optimizer().compute_gradients( policy.alpha_loss, policy.model.log_alpha)], feed_dict=policy._get_loss_inputs_dict( @@ -291,6 +261,8 @@ def test_sac_loss_function(self): a.backward() # `actor_loss` depends on Q-net vars (but these grads must # be ignored and overridden in critic_loss.backward!). + assert not any(v.grad is None + for v in policy.model.q_variables()) assert not all( torch.mean(v.grad) == 0 for v in policy.model.policy_variables()) @@ -301,38 +273,45 @@ def test_sac_loss_function(self): # Compare with tf ones. 
torch_a_grads = [ v.grad for v in policy.model.policy_variables() - if v.grad is not None ] - check(tf_a_grads[2], - np.transpose(torch_a_grads[0].detach().cpu())) + for tf_g, torch_g in zip(tf_a_grads, torch_a_grads): + if tf_g.shape != torch_g.shape: + check(tf_g, np.transpose(torch_g.detach().cpu())) + else: + check(tf_g, torch_g) # Test critic gradients. policy.critic_optims[0].zero_grad() assert all( torch.mean(v.grad) == 0.0 - for v in policy.model.q_variables() if v.grad is not None) + for v in policy.model.q_variables()) assert all( torch.min(v.grad) == 0.0 - for v in policy.model.q_variables() if v.grad is not None) + for v in policy.model.q_variables()) assert policy.model.log_alpha.grad is None c[0].backward() assert not all( torch.mean(v.grad) == 0 - for v in policy.model.q_variables() if v.grad is not None) + for v in policy.model.q_variables()) assert not all( - torch.min(v.grad) == 0 for v in policy.model.q_variables() - if v.grad is not None) + torch.min(v.grad) == 0 for v in policy.model.q_variables()) assert policy.model.log_alpha.grad is None # Compare with tf ones. torch_c_grads = [v.grad for v in policy.model.q_variables()] - check(tf_c_grads[0], - np.transpose(torch_c_grads[2].detach().cpu())) + for tf_g, torch_g in zip(tf_c_grads, torch_c_grads): + if tf_g.shape != torch_g.shape: + check(tf_g, np.transpose(torch_g.detach().cpu())) + else: + check(tf_g, torch_g) # Compare (unchanged(!) actor grads) with tf ones. torch_a_grads = [ v.grad for v in policy.model.policy_variables() ] - check(tf_a_grads[2], - np.transpose(torch_a_grads[0].detach().cpu())) + for tf_g, torch_g in zip(tf_a_grads, torch_a_grads): + if tf_g.shape != torch_g.shape: + check(tf_g, np.transpose(torch_g.detach().cpu())) + else: + check(tf_g, torch_g) # Test alpha gradient. policy.alpha_optim.zero_grad() @@ -357,7 +336,7 @@ def test_sac_loss_function(self): prev_fw_loss = (c, a, e, t) # Update weights from our batch (n times). 
- for update_iteration in range(5): + for update_iteration in range(10): print("train iteration {}".format(update_iteration)) if fw == "tf": in_ = self._get_batch_helper(obs_size, actions, batch_size) @@ -371,9 +350,10 @@ def test_sac_loss_function(self): # Net must have changed. if tf_updated_weights: check( - updated_weights["default_policy/fc_1/kernel"], + updated_weights[ + "default_policy/sequential/action_1/kernel"], tf_updated_weights[-1][ - "default_policy/fc_1/kernel"], + "default_policy/sequential/action_1/kernel"], false=True) tf_updated_weights.append(updated_weights) @@ -387,9 +367,7 @@ def test_sac_loss_function(self): buf._fake_batch = in_ trainer.train() # Compare updated model. - for tf_key in sorted(tf_weights.keys()): - if re.search("_[23]|alpha", tf_key): - continue + for tf_key in sorted(tf_weights.keys())[2:10]: tf_var = tf_weights[tf_key] torch_var = policy.model.state_dict()[map_[tf_key]] if tf_var.shape != torch_var.shape: @@ -403,9 +381,7 @@ def test_sac_loss_function(self): check(policy.model.log_alpha, tf_weights["default_policy/log_alpha"]) # Compare target nets. 
- for tf_key in sorted(tf_weights.keys()): - if not re.search("_[23]", tf_key): - continue + for tf_key in sorted(tf_weights.keys())[10:18]: tf_var = tf_weights[tf_key] torch_var = policy.target_model.state_dict()[map_[ tf_key]] @@ -461,9 +437,9 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, fc( relu( fc(model_out_t, - weights[ks[1]], - weights[ks[0]], - framework=fw)), weights[ks[9]], weights[ks[8]]), None) + weights[ks[3]], + weights[ks[2]], + framework=fw)), weights[ks[5]], weights[ks[4]]), None) policy_t = action_dist_t.deterministic_sample() log_pis_t = action_dist_t.logp(policy_t) if sess: @@ -476,9 +452,9 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, fc( relu( fc(model_out_tp1, - weights[ks[1]], - weights[ks[0]], - framework=fw)), weights[ks[9]], weights[ks[8]]), None) + weights[ks[3]], + weights[ks[2]], + framework=fw)), weights[ks[5]], weights[ks[4]]), None) policy_tp1 = action_dist_tp1.deterministic_sample() log_pis_tp1 = action_dist_tp1.logp(policy_tp1) if sess: @@ -492,11 +468,11 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, relu( fc(np.concatenate( [model_out_t, train_batch[SampleBatch.ACTIONS]], -1), - weights[ks[3]], - weights[ks[2]], + weights[ks[7]], + weights[ks[6]], framework=fw)), - weights[ks[11]], - weights[ks[10]], + weights[ks[9]], + weights[ks[8]], framework=fw) # Q-values for current policy in given current state. @@ -504,11 +480,11 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, q_t_det_policy = fc( relu( fc(np.concatenate([model_out_t, policy_t], -1), - weights[ks[3]], - weights[ks[2]], + weights[ks[7]], + weights[ks[6]], framework=fw)), - weights[ks[11]], - weights[ks[10]], + weights[ks[9]], + weights[ks[8]], framework=fw) # Target q network evaluation. 
@@ -517,11 +493,11 @@ def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, q_tp1 = fc( relu( fc(np.concatenate([target_model_out_tp1, policy_tp1], -1), - weights[ks[7]], - weights[ks[6]], + weights[ks[15]], + weights[ks[14]], framework=fw)), - weights[ks[15]], - weights[ks[14]], + weights[ks[17]], + weights[ks[16]], framework=fw) else: assert fw == "tfe" @@ -562,9 +538,9 @@ def _translate_weights_to_torch(self, weights_dict, map_): map_[k]: convert_to_torch_tensor( np.transpose(v) if re.search("kernel", k) else np.array([v]) if re.search("log_alpha", k) else v) - for i, (k, v) in enumerate(weights_dict.items()) if i < 13 + for k, v in weights_dict.items() + if re.search("(sequential(/|_1)|value_out/|log_alpha)", k) } - return model_dict def _translate_tfe_weights(self, weights_dict, map_): diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index b2c57d0b1311..9055fe378a36 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -52,7 +52,7 @@ # Number of rollout worker actors to create for parallel sampling. Setting # this to 0 will force rollouts to be done in the trainer actor. "num_workers": 2, - # Number of environments to evaluate vector-wise per worker. This enables + # Number of environments to evaluate vectorwise per worker. This enables # model inference batching, which can improve performance for inference # bottlenecked workloads. "num_envs_per_worker": 1, @@ -120,18 +120,10 @@ # set this if soft_horizon=True, unless your env is actually running # forever without returning done=True. "no_done_at_end": False, - # Environment name can also be passed via config. - "env": None, # Arguments to pass to the env creator. "env_config": {}, - # If True, try to render the environment on the local worker or on worker - # 1 (if num_workers > 0). For vectorized envs, this usually means that only - # the first sub-environment will be rendered. - "render_env": False, - # If True, store evaluation videos in the output dir. 
- # Alternatively, provide a path (str) to a directory here, where the env - # recordings should be stored instead. - "record_env": False, + # Environment name can also be passed via config. + "env": None, # Unsquash actions to the upper and lower bounds of env's action space "normalize_actions": False, # Whether to clip rewards during Policy's postprocessing. @@ -221,10 +213,9 @@ }, # Number of parallel workers to use for evaluation. Note that this is set # to zero by default, which means evaluation will be run in the trainer - # process (only if evaluation_interval is not None). If you increase this, - # it will increase the Ray resource usage of the trainer since evaluation - # workers are created separately from rollout workers (used to sample data - # for training). + # process. If you increase this, it will increase the Ray resource usage + # of the trainer since evaluation workers are created separately from + # rollout workers. "evaluation_num_workers": 0, # Customize the evaluation method. This must be a function of signature # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the @@ -544,6 +535,14 @@ def train(self) -> ResultDict: if hasattr(self, "workers") and isinstance(self.workers, WorkerSet): self._sync_filters_if_needed(self.workers) + if self.config["evaluation_interval"] == 1 or ( + self._iteration > 0 and self.config["evaluation_interval"] + and self._iteration % self.config["evaluation_interval"] == 0): + evaluation_metrics = self._evaluate() + assert isinstance(evaluation_metrics, dict), \ + "_evaluate() needs to return a dict." 
+ result.update(evaluation_metrics) + return result def _sync_filters_if_needed(self, workers: WorkerSet): @@ -671,6 +670,7 @@ def get_scope(): extra_config["in_evaluation"] is True extra_config.update({ "batch_mode": "complete_episodes", + "rollout_fragment_length": 1, "in_evaluation": True, }) logger.debug( @@ -1102,7 +1102,7 @@ def _validate_config(config: PartialTrainerConfigDict): if model_config.get("_time_major"): raise ValueError("`model._time_major` only supported " "iff `_use_trajectory_view_api` is True!") - elif traj_view_framestacks not in ["auto", 0]: + elif traj_view_framestacks != "auto": raise ValueError("`model.num_framestacks` only supported " "iff `_use_trajectory_view_api` is True!") model_config["num_framestacks"] = 0 diff --git a/rllib/agents/trainer_template.py b/rllib/agents/trainer_template.py index 600cbef12bd9..b896958b6bf1 100644 --- a/rllib/agents/trainer_template.py +++ b/rllib/agents/trainer_template.py @@ -146,18 +146,6 @@ def _init(self, config: TrainerConfigDict, @override(Trainer) def step(self): res = next(self.train_exec_impl) - - # self._iteration gets incremented after this function returns, - # meaning that e. g. the first time this function is called, - # self._iteration will be 0. We check `self._iteration+1` in the - # if-statement below to reflect that the first training iteration - # is already over. - if (self.config["evaluation_interval"] and (self._iteration + 1) % - self.config["evaluation_interval"] == 0): - evaluation_metrics = self._evaluate() - assert isinstance(evaluation_metrics, dict), \ - "_evaluate() needs to return a dict." 
- res.update(evaluation_metrics) return res @override(Trainer) diff --git a/rllib/contrib/registry.py b/rllib/contrib/registry.py index 301516602c24..aed8712bbc0c 100644 --- a/rllib/contrib/registry.py +++ b/rllib/contrib/registry.py @@ -3,29 +3,28 @@ def _import_random_agent(): from ray.rllib.contrib.random_agent.random_agent import RandomAgent - return RandomAgent, RandomAgent._default_config + return RandomAgent def _import_maddpg(): from ray.rllib.contrib import maddpg - return maddpg.MADDPGTrainer, maddpg.DEFAULT_CONFIG + return maddpg.MADDPGTrainer def _import_alphazero(): from ray.rllib.contrib.alpha_zero.core.alpha_zero_trainer import\ - AlphaZeroTrainer, DEFAULT_CONFIG - return AlphaZeroTrainer, DEFAULT_CONFIG + AlphaZeroTrainer + return AlphaZeroTrainer def _import_bandit_lints(): - from ray.rllib.contrib.bandits.agents.lin_ts import LinTSTrainer, TS_CONFIG - return LinTSTrainer, TS_CONFIG + from ray.rllib.contrib.bandits.agents.lin_ts import LinTSTrainer + return LinTSTrainer def _import_bandit_linucb(): - from ray.rllib.contrib.bandits.agents.lin_ucb import LinUCBTrainer, \ - UCB_CONFIG - return LinUCBTrainer, UCB_CONFIG + from ray.rllib.contrib.bandits.agents.lin_ucb import LinUCBTrainer + return LinUCBTrainer CONTRIBUTED_ALGORITHMS = { diff --git a/rllib/env/base_env.py b/rllib/env/base_env.py index 081fae6fe13c..9ff16ac5ac6c 100644 --- a/rllib/env/base_env.py +++ b/rllib/env/base_env.py @@ -5,8 +5,8 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.vector_env import VectorEnv from ray.rllib.utils.annotations import override, PublicAPI -from ray.rllib.utils.typing import AgentID, EnvID, EnvType, MultiAgentDict, \ - MultiEnvDict, PartialTrainerConfigDict +from ray.rllib.utils.typing import EnvType, MultiEnvDict, EnvID, \ + AgentID, MultiAgentDict if TYPE_CHECKING: from ray.rllib.models.preprocessors import Preprocessor @@ -80,14 +80,11 @@ class BaseEnv: """ @staticmethod - def to_base_env( - env: EnvType, - make_env: 
Callable[[int], EnvType] = None, - num_envs: int = 1, - remote_envs: bool = False, - remote_env_batch_wait_ms: int = 0, - policy_config: PartialTrainerConfigDict = None, - ) -> "BaseEnv": + def to_base_env(env: EnvType, + make_env: Callable[[int], EnvType] = None, + num_envs: int = 1, + remote_envs: bool = False, + remote_env_batch_wait_ms: int = 0) -> "BaseEnv": """Wraps any env type as needed to expose the async interface.""" from ray.rllib.env.remote_vector_env import RemoteVectorEnv @@ -132,9 +129,7 @@ def to_base_env( existing_envs=[env], num_envs=num_envs, action_space=env.action_space, - observation_space=env.observation_space, - policy_config=policy_config, - ) + observation_space=env.observation_space) env = _VectorEnvToBaseEnv(env) assert isinstance(env, BaseEnv), env return env @@ -210,18 +205,6 @@ def stop(self) -> None: if hasattr(env, "close"): env.close() - # Experimental method. - def try_render(self, env_id: Optional[EnvID] = None) -> None: - """Tries to render the environment. - - Args: - env_id (Optional[int]): The sub-env ID if applicable. If None, - renders the entire Env (i.e. all sub-envs). - """ - - # By default, do nothing. 
- pass - # Fixed agent identifier when there is only the single agent in the env _DUMMY_AGENT_ID = "agent0" @@ -363,19 +346,14 @@ def send_actions(self, action_dict: MultiEnvDict) -> None: self.vector_env.vector_step(action_vector) @override(BaseEnv) - def try_reset(self, env_id: Optional[EnvID] = None) -> MultiAgentDict: - assert env_id is None or isinstance(env_id, int) + def try_reset(self, + env_id: Optional[EnvID] = None) -> Optional[MultiAgentDict]: return {_DUMMY_AGENT_ID: self.vector_env.reset_at(env_id)} @override(BaseEnv) def get_unwrapped(self) -> List[EnvType]: return self.vector_env.get_unwrapped() - @override(BaseEnv) - def try_render(self, env_id: Optional[EnvID] = None) -> None: - assert env_id is None or isinstance(env_id, int) - return self.vector_env.try_render_at(env_id) - class _MultiAgentEnvToBaseEnv(BaseEnv): """Internal adapter of MultiAgentEnv to BaseEnv. diff --git a/rllib/env/policy_client.py b/rllib/env/policy_client.py index 39a85a5cf91b..232f74f1a17f 100644 --- a/rllib/env/policy_client.py +++ b/rllib/env/policy_client.py @@ -17,6 +17,7 @@ EnvActionType logger = logging.getLogger(__name__) +logger.setLevel("INFO") # TODO(ekl) seems to be needed for cartpole_client.py try: import requests # `requests` is not part of stdlib. 
diff --git a/rllib/env/policy_server_input.py b/rllib/env/policy_server_input.py index 952130ac5306..45c2a00d292c 100644 --- a/rllib/env/policy_server_input.py +++ b/rllib/env/policy_server_input.py @@ -13,6 +13,7 @@ from ray.rllib.utils.annotations import override, PublicAPI logger = logging.getLogger(__name__) +logger.setLevel("INFO") # TODO(ekl) this is needed for cartpole_server.py class PolicyServerInput(ThreadingMixIn, HTTPServer, InputReader): diff --git a/rllib/env/vector_env.py b/rllib/env/vector_env.py index f07098d0a352..49d4bdf6d855 100644 --- a/rllib/env/vector_env.py +++ b/rllib/env/vector_env.py @@ -1,12 +1,11 @@ import logging import gym -from gym import wrappers as gym_wrappers import numpy as np -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Tuple from ray.rllib.utils.annotations import override, PublicAPI -from ray.rllib.utils.typing import EnvActionType, EnvConfigDict, EnvInfoDict, \ - EnvObsType, EnvType, PartialTrainerConfigDict +from ray.rllib.utils.typing import EnvType, EnvConfigDict, EnvObsType, \ + EnvInfoDict, EnvActionType logger = logging.getLogger(__name__) @@ -31,22 +30,19 @@ def __init__(self, observation_space: gym.Space, action_space: gym.Space, self.num_envs = num_envs @staticmethod - def wrap(make_env: Optional[Callable[[int], EnvType]] = None, - existing_envs: Optional[List[gym.Env]] = None, + def wrap(make_env: Callable[[int], EnvType] = None, + existing_envs: List[gym.Env] = None, num_envs: int = 1, - action_space: Optional[gym.Space] = None, - observation_space: Optional[gym.Space] = None, - env_config: Optional[EnvConfigDict] = None, - policy_config: Optional[PartialTrainerConfigDict] = None): + action_space: gym.Space = None, + observation_space: gym.Space = None, + env_config: EnvConfigDict = None): return _VectorizedGymEnv( make_env=make_env, existing_envs=existing_envs or [], num_envs=num_envs, observation_space=observation_space, action_space=action_space, - 
env_config=env_config, - policy_config=policy_config, - ) + env_config=env_config) @PublicAPI def vector_reset(self) -> List[EnvObsType]: @@ -58,12 +54,9 @@ def vector_reset(self) -> List[EnvObsType]: raise NotImplementedError @PublicAPI - def reset_at(self, index: Optional[int] = None) -> EnvObsType: + def reset_at(self, index: int) -> EnvObsType: """Resets a single environment. - Args: - index (Optional[int]): An optional sub-env index to reset. - Returns: obs (obj): Observations from the reset sub environment. """ @@ -95,31 +88,19 @@ def get_unwrapped(self) -> List[EnvType]: """ raise NotImplementedError - # Experimental method. - def try_render_at(self, index: Optional[int] = None) -> None: - """Renders a single environment. - - Args: - index (Optional[int]): An optional sub-env index to render. - """ - pass - class _VectorizedGymEnv(VectorEnv): """Internal wrapper to translate any gym envs into a VectorEnv object. """ - def __init__( - self, - make_env=None, - existing_envs=None, - num_envs=1, - *, - observation_space=None, - action_space=None, - env_config=None, - policy_config=None, - ): + def __init__(self, + make_env=None, + existing_envs=None, + num_envs=1, + *, + observation_space=None, + action_space=None, + env_config=None): """Initializes a _VectorizedGymEnv object. Args: @@ -135,27 +116,11 @@ def __init__( If None, use existing_envs[0]'s action space. env_config (Optional[dict]): Additional sub env config to pass to make_env as first arg. - policy_config (Optional[PartialTrainerConfigDict]): An optional - trainer/policy config dict. """ + self.make_env = make_env self.envs = existing_envs - - # Fill up missing envs (so we have exactly num_envs sub-envs in this - # VectorEnv. while len(self.envs) < num_envs: - self.envs.append(make_env(len(self.envs))) - - # Wrap all envs with video recorder if necessary. 
- if policy_config is not None and policy_config.get("record_env"): - - def wrapper_(env): - return gym_wrappers.Monitor( - env=env, - directory=policy_config["record_env"], - video_callable=lambda _: True, - force=True) - - self.envs = [wrapper_(e) for e in self.envs] + self.envs.append(self.make_env(len(self.envs))) super().__init__( observation_space=observation_space @@ -168,9 +133,7 @@ def vector_reset(self): return [e.reset() for e in self.envs] @override(VectorEnv) - def reset_at(self, index: Optional[int] = None) -> EnvObsType: - if index is None: - index = 0 + def reset_at(self, index): return self.envs[index].reset() @override(VectorEnv) @@ -194,9 +157,3 @@ def vector_step(self, actions): @override(VectorEnv) def get_unwrapped(self): return self.envs - - @override(VectorEnv) - def try_render_at(self, index: Optional[int] = None): - if index is None: - index = 0 - return self.envs[index].render() diff --git a/rllib/env/wrappers/dm_control_wrapper.py b/rllib/env/wrappers/dm_control_wrapper.py index 3286aae28adf..6734e2a3ab66 100644 --- a/rllib/env/wrappers/dm_control_wrapper.py +++ b/rllib/env/wrappers/dm_control_wrapper.py @@ -31,7 +31,7 @@ specs = None try: from dm_control import suite -except (ImportError, OSError): +except ImportError: suite = None import numpy as np diff --git a/rllib/env/wrappers/tests/test_unity3d_env.py b/rllib/env/wrappers/tests/test_unity3d_env.py deleted file mode 100644 index 5e347ed0ec05..000000000000 --- a/rllib/env/wrappers/tests/test_unity3d_env.py +++ /dev/null @@ -1,55 +0,0 @@ -import unittest -from unittest.mock import patch - -from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv - - -@patch("mlagents_envs.environment.UnityEnvironment") -class TestUnity3DEnv(unittest.TestCase): - def test_port_editor(self, mock_unity3d): - """Test if the environment uses the editor port - when no environment file is provided""" - - _ = Unity3DEnv(port=None) - args, kwargs = mock_unity3d.call_args - mock_unity3d.assert_called_once() - 
self.assertEqual(5004, kwargs.get("base_port")) - - def test_port_app(self, mock_unity3d): - """Test if the environment uses the correct port - when the environment file is provided""" - - _ = Unity3DEnv(file_name="app", port=None) - args, kwargs = mock_unity3d.call_args - mock_unity3d.assert_called_once() - self.assertEqual(5005, kwargs.get("base_port")) - - def test_ports_multi_app(self, mock_unity3d): - """Test if the base_port + worker_id - is different for each environment""" - - _ = Unity3DEnv(file_name="app", port=None) - args, kwargs_first = mock_unity3d.call_args - _ = Unity3DEnv(file_name="app", port=None) - args, kwargs_second = mock_unity3d.call_args - self.assertNotEqual( - kwargs_first.get("base_port") + kwargs_first.get("worker_id"), - kwargs_second.get("base_port") + kwargs_second.get("worker_id")) - - def test_custom_port_app(self, mock_unity3d): - """Test if the base_port + worker_id is different - for each environment when using custom ports""" - - _ = Unity3DEnv(file_name="app", port=5010) - args, kwargs_first = mock_unity3d.call_args - _ = Unity3DEnv(file_name="app", port=5010) - args, kwargs_second = mock_unity3d.call_args - self.assertNotEqual( - kwargs_first.get("base_port") + kwargs_first.get("worker_id"), - kwargs_second.get("base_port") + kwargs_second.get("worker_id")) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/wrappers/unity3d_env.py b/rllib/env/wrappers/unity3d_env.py index 876c06e96508..753c234439d7 100644 --- a/rllib/env/wrappers/unity3d_env.py +++ b/rllib/env/wrappers/unity3d_env.py @@ -27,12 +27,7 @@ class Unity3DEnv(MultiAgentEnv): inside an RLlib PolicyClient for cloud/distributed training of Unity games. 
""" - # Default base port when connecting directly to the Editor - _BASE_PORT_EDITOR = 5004 - # Default base port when connecting to a compiled environment - _BASE_PORT_ENVIRONMENT = 5005 - # The worker_id for each environment instance - _WORKER_ID = 0 + _BASE_PORT = 5004 def __init__(self, file_name: str = None, @@ -78,24 +73,18 @@ def __init__(self, # environments (num_workers >> 1). Otherwise, would lead to port # conflicts sometimes. time.sleep(random.randint(1, 10)) - port_ = port or (self._BASE_PORT_ENVIRONMENT - if file_name else self._BASE_PORT_EDITOR) - # cache the worker_id and - # increase it for the next environment - worker_id_ = Unity3DEnv._WORKER_ID if file_name else 0 - Unity3DEnv._WORKER_ID += 1 + port_ = port or self._BASE_PORT + self._BASE_PORT += 1 try: self.unity_env = UnityEnvironment( file_name=file_name, - worker_id=worker_id_, + worker_id=0, base_port=port_, seed=seed, no_graphics=no_graphics, timeout_wait=timeout_wait, ) - print( - "Created UnityEnvironment for port {}".format(port_ + - worker_id_)) + print("Created UnityEnvironment for port {}".format(port_)) except mlagents_envs.exception.UnityWorkerInUseException: pass else: diff --git a/rllib/evaluation/metrics.py b/rllib/evaluation/metrics.py index e44b301f42d3..6ed723b156d2 100644 --- a/rllib/evaluation/metrics.py +++ b/rllib/evaluation/metrics.py @@ -1,7 +1,7 @@ import logging import numpy as np import collections -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import ray from ray.rllib.evaluation.rollout_metrics import RolloutMetrics @@ -14,19 +14,6 @@ logger = logging.getLogger(__name__) -def extract_stats(stats: Dict, key: str) -> Dict[str, Any]: - if key in stats: - return stats[key] - - multiagent_stats = {} - for k, v in stats.items(): - if isinstance(v, dict): - if key in v: - multiagent_stats[k] = v[key] - - return multiagent_stats - - @DeveloperAPI def get_learner_stats(grad_info: GradInfoDict) -> 
LearnerStatsDict: """Return optimization stats reported from the policy. diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index e824a01747d7..d0770cdf7dbb 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -32,7 +32,7 @@ from ray.rllib.utils import merge_dicts from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.debug import summarize -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.filter import get_filter, Filter from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.sgd import do_minibatch_sgd @@ -396,22 +396,15 @@ def wrap(env): if clip_rewards is None: clip_rewards = True - # Deprecated way of framestacking is used. - framestack = model_config.get("framestack") is True # framestacking via trajectory view API is enabled. num_framestacks = model_config.get("num_framestacks", 0) - - # No trajectory view API: No traj. view based framestacking. if not policy_config["_use_trajectory_view_api"]: model_config["num_framestacks"] = num_framestacks = 0 - # Trajectory view API is on and num_framestacks=auto: Only - # stack traj. view based if old `framestack=[invalid value]`. elif num_framestacks == "auto": - if framestack == DEPRECATED_VALUE: - model_config["num_framestacks"] = num_framestacks = 4 - else: - model_config["num_framestacks"] = num_framestacks = 0 + model_config["num_framestacks"] = num_framestacks = 4 framestack_traj_view = num_framestacks > 1 + # Deprecated way of framestacking is used. 
+ framestack = model_config.get("framestack") is True def wrap(env): env = wrap_deepmind( @@ -546,9 +539,7 @@ def make_env(vector_index): make_env=make_env, num_envs=num_envs, remote_envs=remote_worker_envs, - remote_env_batch_wait_ms=remote_env_batch_wait_ms, - policy_config=policy_config, - ) + remote_env_batch_wait_ms=remote_env_batch_wait_ms) # `truncate_episodes`: Allow a batch to contain more than one episode # (fragments) and always make the batch `rollout_fragment_length` @@ -585,11 +576,6 @@ def make_env(vector_index): raise ValueError( "Unknown evaluation method: {}".format(method)) - render = False - if policy_config.get("render_env") is True and \ - (num_workers == 0 or worker_index == 1): - render = True - if self.env is None: self.sampler = None elif sample_async: @@ -615,7 +601,6 @@ def make_env(vector_index): _use_trajectory_view_api=_use_trajectory_view_api, sample_collector_class=policy_config.get( "sample_collector_class"), - render=render, ) # Start the Sampler thread. self.sampler.start() @@ -641,7 +626,6 @@ def make_env(vector_index): _use_trajectory_view_api=_use_trajectory_view_api, sample_collector_class=policy_config.get( "sample_collector_class"), - render=render, ) self.input_reader: InputReader = input_creator(self.io_context) diff --git a/rllib/evaluation/sampler.py b/rllib/evaluation/sampler.py index 1eea70fc3cdf..eb81b65de9c9 100644 --- a/rllib/evaluation/sampler.py +++ b/rllib/evaluation/sampler.py @@ -65,16 +65,17 @@ class _PerfStats: def __init__(self): self.iters = 0 + self.env_wait_time = 0.0 self.raw_obs_processing_time = 0.0 self.inference_time = 0.0 self.action_processing_time = 0.0 - self.env_wait_time = 0.0 - self.env_render_time = 0.0 def get(self): # Mean multiplicator (1000 = ms -> sec). factor = 1000 / self.iters return { + # Waiting for environment (during poll). + "mean_env_wait_ms": self.env_wait_time * factor, # Raw observation preprocessing. 
"mean_raw_obs_processing_ms": self.raw_obs_processing_time * factor, @@ -82,10 +83,6 @@ def get(self): "mean_inference_ms": self.inference_time * factor, # Processing actions (to be sent to env, e.g. clipping). "mean_action_processing_ms": self.action_processing_time * factor, - # Waiting for environment (during poll). - "mean_env_wait_ms": self.env_wait_time * factor, - # Environment rendering (False by default). - "mean_env_render_ms": self.env_render_time * factor, } @@ -144,9 +141,7 @@ def __init__( no_done_at_end: bool = False, observation_fn: "ObservationFunction" = None, _use_trajectory_view_api: bool = False, - sample_collector_class: Optional[Type[SampleCollector]] = None, - render: bool = False, - ): + sample_collector_class: Optional[Type[SampleCollector]] = None): """Initializes a SyncSampler object. Args: @@ -189,8 +184,6 @@ def __init__( sample_collector_class (Optional[Type[SampleCollector]]): An optional Samplecollector sub-class to use to collect, store, and retrieve environment-, model-, and sampler data. - render (bool): Whether to try to render the environment after each - step. """ self.base_env = BaseEnv.to_base_env(env) @@ -214,7 +207,6 @@ def __init__( count_steps_by=count_steps_by) else: self.sample_collector = None - self.render = render # Create the rollout generator to use for calls to `get_data()`. 
self.rollout_provider = _env_runner( @@ -223,7 +215,7 @@ def __init__( self.preprocessors, self.obs_filters, clip_rewards, clip_actions, multiple_episodes_in_batch, callbacks, tf_sess, self.perf_stats, soft_horizon, no_done_at_end, observation_fn, - _use_trajectory_view_api, self.sample_collector, self.render) + _use_trajectory_view_api, self.sample_collector) self.metrics_queue = queue.Queue() @override(SamplerInput) @@ -288,7 +280,6 @@ def __init__( observation_fn: "ObservationFunction" = None, _use_trajectory_view_api: bool = False, sample_collector_class: Optional[Type[SampleCollector]] = None, - render: bool = False, ): """Initializes a AsyncSampler object. @@ -336,8 +327,6 @@ def __init__( sample_collector_class (Optional[Type[SampleCollector]]): An optional Samplecollector sub-class to use to collect, store, and retrieve environment-, model-, and sampler data. - render (bool): Whether to try to render the environment after each - step. """ for _, f in obs_filters.items(): assert getattr(f, "is_concurrent", False), \ @@ -367,7 +356,6 @@ def __init__( self.shutdown = False self.observation_fn = observation_fn self._use_trajectory_view_api = _use_trajectory_view_api - self.render = render if _use_trajectory_view_api: if not sample_collector_class: sample_collector_class = SimpleListCollector @@ -404,7 +392,7 @@ def _run(self): self.clip_actions, self.multiple_episodes_in_batch, self.callbacks, self.tf_sess, self.perf_stats, self.soft_horizon, self.no_done_at_end, self.observation_fn, - self._use_trajectory_view_api, self.sample_collector, self.render) + self._use_trajectory_view_api, self.sample_collector) while not self.shutdown: # The timeout variable exists because apparently, if one worker # dies, the other workers won't die with it, unless the timeout is @@ -470,7 +458,6 @@ def _env_runner( observation_fn: "ObservationFunction", _use_trajectory_view_api: bool = False, sample_collector: Optional[SampleCollector] = None, - render: bool = None, ) -> 
Iterable[SampleBatchType]: """This implements the common experience collection logic. @@ -510,9 +497,7 @@ def _env_runner( `_use_trajectory_view_api` to make generic trajectory views available to Models. Default: False. sample_collector (Optional[SampleCollector]): An optional - SampleCollector object to use. - render (bool): Whether to try to render the environment after each - step. + SampleCollector object to use Yields: rollout (SampleBatch): Object containing state, action, reward, @@ -701,12 +686,6 @@ def new_episode(env_id): base_env.send_actions(actions_to_send) perf_stats.env_wait_time += time.time() - t4 - # Try to render the env, if required. - if render: - t5 = time.time() - base_env.try_render() - perf_stats.env_render_time += time.time() - t5 - def _process_observations( *, diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 1c56ef2b9e65..1601e07f3666 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -25,7 +25,7 @@ class MyCallbacks(DefaultCallbacks): @override(DefaultCallbacks) - def on_learn_on_batch(self, *, policy, train_batch, result, **kwargs): + def on_learn_on_batch(self, *, policy, train_batch, **kwargs): assert train_batch.count == 201 assert sum(train_batch.seq_lens) == 201 for k, v in train_batch.data.items(): diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 8361e0af8777..80cf617bb029 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -8,7 +8,7 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \ _validate_multiagent_config from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ - ShuffledInput, D4RLReader + ShuffledInput from ray.rllib.env.env_context import EnvContext from ray.rllib.policy import Policy from ray.rllib.utils import merge_dicts @@ -266,9 +266,6 @@ def session_creator(): 
input_creator = ( lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx), config["shuffle_buffer_size"])) - elif "d4rl" in config["input"]: - env_name = config["input"].split(".")[1] - input_creator = (lambda ioctx: D4RLReader(env_name, ioctx)) else: input_creator = ( lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx), diff --git a/rllib/examples/custom_metrics_and_callbacks.py b/rllib/examples/custom_metrics_and_callbacks.py index ecbe99bd7baa..745a94029a2e 100644 --- a/rllib/examples/custom_metrics_and_callbacks.py +++ b/rllib/examples/custom_metrics_and_callbacks.py @@ -59,12 +59,6 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): # you can mutate the result dict to add new fields to return result["callback_ok"] = True - def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, - result: dict, **kwargs) -> None: - result["sum_actions_in_train_batch"] = np.sum(train_batch["actions"]) - print("policy.learn_on_batch() result: {} -> sum actions: {}".format( - policy, result["sum_actions_in_train_batch"])) - def on_postprocess_trajectory( self, *, worker: RolloutWorker, episode: MultiAgentEpisode, agent_id: str, policy_id: str, policies: Dict[str, Policy], @@ -94,7 +88,7 @@ def on_postprocess_trajectory( "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), }).trials - # Verify episode-related custom metrics are there. + # verify custom metrics for integration tests custom_metrics = trials[0].last_result["custom_metrics"] print(custom_metrics) assert "pole_angle_mean" in custom_metrics @@ -102,8 +96,3 @@ def on_postprocess_trajectory( assert "pole_angle_max" in custom_metrics assert "num_batches_mean" in custom_metrics assert "callback_ok" in trials[0].last_result - - # Verify `on_learn_on_batch` custom metrics are there (per policy). 
- info_custom_metrics = custom_metrics["default_policy"] - print(info_custom_metrics) - assert "sum_actions_in_train_batch" in info_custom_metrics diff --git a/rllib/examples/env/cartpole_mass.py b/rllib/examples/env/cartpole_mass.py deleted file mode 100644 index a0519cb17869..000000000000 --- a/rllib/examples/env/cartpole_mass.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -import gym -from gym.envs.classic_control.cartpole import CartPoleEnv -from ray.rllib.env.meta_env import MetaEnv - - -class CartPoleMassEnv(CartPoleEnv, gym.utils.EzPickle, MetaEnv): - """CartPoleMassEnv varies the weights of the cart and the pole. - """ - - def sample_tasks(self, n_tasks): - # Sample new cart- and pole masses (random floats between 0.5 and 2.0 - # (cart) and between 0.05 and 0.2 (pole)). - cart_masses = np.random.uniform(low=0.5, high=2.0, size=(n_tasks, 1)) - pole_masses = np.random.uniform(low=0.05, high=0.2, size=(n_tasks, 1)) - return np.concatenate([cart_masses, pole_masses], axis=-1) - - def set_task(self, task): - """ - Args: - task (Tuple[float]): Masses of the cart and the pole. - """ - self.masscart = task[0] - self.masspole = task[1] - - def get_task(self): - """ - Returns: - Tuple[float]: The current mass of the cart- and pole. - """ - return np.array([self.masscart, self.masspole]) diff --git a/rllib/examples/env/mbmpo_env.py b/rllib/examples/env/mbmpo_env.py index 87c367611d98..c49ef77be78c 100644 --- a/rllib/examples/env/mbmpo_env.py +++ b/rllib/examples/env/mbmpo_env.py @@ -1,12 +1,12 @@ +import gym from gym.envs.classic_control import PendulumEnv, CartPoleEnv import numpy as np # MuJoCo may not be installed. 
HalfCheetahEnv = HopperEnv = None - try: from gym.envs.mujoco import HalfCheetahEnv, HopperEnv -except Exception: +except (ImportError, gym.error.DependencyNotInstalled): pass @@ -22,12 +22,11 @@ def reward(self, obs, action, obs_next): x = obs_next[:, 0] theta = obs_next[:, 2] - # 1.0 if we are still on, 0.0 if we are terminated due to bounds - # (angular or x-axis) being breached. - rew = 1.0 - ((x < -self.x_threshold) | (x > self.x_threshold) | - (theta < -self.theta_threshold_radians) | - (theta > self.theta_threshold_radians)).astype(np.float32) + rew = (x < -self.x_threshold) | (x > self.x_threshold) | ( + theta < -self.theta_threshold_radians) | ( + theta > self.theta_threshold_radians) + rew = rew.astype(float) return rew @@ -55,45 +54,46 @@ def angle_normalize(x): return (((x + np.pi) % (2 * np.pi)) - np.pi) -class HalfCheetahWrapper(HalfCheetahEnv or object): - """Wrapper for the MuJoCo HalfCheetah-v2 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). - """ - - def reward(self, obs, action, obs_next): - if obs.ndim == 2 and action.ndim == 2: - assert obs.shape == obs_next.shape - forward_vel = obs_next[:, 8] - ctrl_cost = 0.1 * np.sum(np.square(action), axis=1) - reward = forward_vel - ctrl_cost - return np.minimum(np.maximum(-1000.0, reward), 1000.0) - else: - forward_vel = obs_next[8] - ctrl_cost = 0.1 * np.square(action).sum() - reward = forward_vel - ctrl_cost +if HalfCheetahEnv: + + class HalfCheetahWrapper(HalfCheetahEnv): + """Wrapper for the MuJoCo HalfCheetah-v2 environment. + + Adds an additional `reward` method for some model-based RL algos (e.g. + MB-MPO). 
+ """ + + def reward(self, obs, action, obs_next): + if obs.ndim == 2 and action.ndim == 2: + assert obs.shape == obs_next.shape + forward_vel = obs_next[:, 8] + ctrl_cost = 0.1 * np.sum(np.square(action), axis=1) + reward = forward_vel - ctrl_cost + return np.minimum(np.maximum(-1000.0, reward), 1000.0) + else: + forward_vel = obs_next[8] + ctrl_cost = 0.1 * np.square(action).sum() + reward = forward_vel - ctrl_cost + return np.minimum(np.maximum(-1000.0, reward), 1000.0) + + class HopperWrapper(HopperEnv): + """Wrapper for the MuJoCo Hopper-v2 environment. + + Adds an additional `reward` method for some model-based RL algos (e.g. + MB-MPO). + """ + + def reward(self, obs, action, obs_next): + alive_bonus = 1.0 + assert obs.ndim == 2 and action.ndim == 2 + assert (obs.shape == obs_next.shape + and action.shape[0] == obs.shape[0]) + vel = obs_next[:, 5] + ctrl_cost = 1e-3 * np.sum(np.square(action), axis=1) + reward = vel + alive_bonus - ctrl_cost return np.minimum(np.maximum(-1000.0, reward), 1000.0) -class HopperWrapper(HopperEnv or object): - """Wrapper for the MuJoCo Hopper-v2 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). 
- """ - - def reward(self, obs, action, obs_next): - alive_bonus = 1.0 - assert obs.ndim == 2 and action.ndim == 2 - assert (obs.shape == obs_next.shape - and action.shape[0] == obs.shape[0]) - vel = obs_next[:, 5] - ctrl_cost = 1e-3 * np.sum(np.square(action), axis=1) - reward = vel + alive_bonus - ctrl_cost - return np.minimum(np.maximum(-1000.0, reward), 1000.0) - - if __name__ == "__main__": env = PendulumWrapper() env.reset() diff --git a/rllib/examples/env/pendulum_mass.py b/rllib/examples/env/pendulum_mass.py index b68b283e7410..c4dc93ed7342 100644 --- a/rllib/examples/env/pendulum_mass.py +++ b/rllib/examples/env/pendulum_mass.py @@ -11,22 +11,19 @@ class PendulumMassEnv(PendulumEnv, gym.utils.EzPickle, MetaEnv): """ def sample_tasks(self, n_tasks): - # Sample new pendulum masses (random floats between 0.5 and 2). + # Mass is a random float between 0.5 and 2 return np.random.uniform(low=0.5, high=2.0, size=(n_tasks, )) def set_task(self, task): """ Args: - task (float): Task of the meta-learning environment (here: mass of - the pendulum). + task: task of the meta-learning environment """ - # self.m is the mass property of the pendulum. self.m = task def get_task(self): """ Returns: - float: The current mass of the pendulum (self.m in the PendulumEnv - object). 
+ task: task of the meta-learning environment """ return self.m diff --git a/rllib/examples/export/cartpole_dqn_export.py b/rllib/examples/export/cartpole_dqn_export.py index 8d0ac7abaf87..8b315dd79a34 100644 --- a/rllib/examples/export/cartpole_dqn_export.py +++ b/rllib/examples/export/cartpole_dqn_export.py @@ -3,7 +3,7 @@ import os import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.framework import try_import_tf tf1, tf, tfv = try_import_tf() @@ -12,7 +12,7 @@ def train_and_export(algo_name, num_steps, model_dir, ckpt_dir, prefix): - cls = get_trainer_class(algo_name) + cls = get_agent_class(algo_name) alg = cls(config={}, env="CartPole-v0") for _ in range(num_steps): alg.train() diff --git a/rllib/examples/models/cnn_plus_fc_concat_model.py b/rllib/examples/models/cnn_plus_fc_concat_model.py new file mode 100644 index 000000000000..6f8e3d85e4e2 --- /dev/null +++ b/rllib/examples/models/cnn_plus_fc_concat_model.py @@ -0,0 +1,218 @@ +from gym.spaces import Discrete, Tuple + +from ray.rllib.examples.models.impala_vision_nets import TorchImpalaVisionNet +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.torch.misc import normc_initializer as \ + torch_normc_initializer, SlimFC +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.utils import get_filter_config +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +# __sphinx_doc_begin__ +class CNNPlusFCConcatModel(TFModelV2): + """TFModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). 
+ + Note: This model should be used for complex (Dict or Tuple) observation + spaces that have one or more image components. + """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + # TODO: (sven) Support Dicts as well. + assert isinstance(obs_space.original_space, (Tuple)), \ + "`obs_space.original_space` must be Tuple!" + + super().__init__(obs_space, action_space, num_outputs, model_config, + name) + + # Build the CNN(s) given obs_space's image components. + self.cnns = {} + concat_size = 0 + for i, component in enumerate(obs_space.original_space): + # Image space. + if len(component.shape) == 3: + config = { + "conv_filters": model_config.get( + "conv_filters", get_filter_config(component.shape)), + "conv_activation": model_config.get("conv_activation"), + } + cnn = ModelCatalog.get_model_v2( + component, + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="cnn_{}".format(i)) + concat_size += cnn.num_outputs + self.cnns[i] = cnn + # Discrete inputs -> One-hot encode. + elif isinstance(component, Discrete): + concat_size += component.n + # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). + # Everything else (1D Box). + else: + assert len(component.shape) == 1, \ + "Only input Box 1D or 3D spaces allowed!" + concat_size += component.shape[-1] + + self.logits_and_value_model = None + self._value_out = None + if num_outputs: + # Action-distribution head. + concat_layer = tf.keras.layers.Input((concat_size, )) + logits_layer = tf.keras.layers.Dense( + num_outputs, + activation=tf.keras.activations.linear, + name="logits")(concat_layer) + + # Create the value branch model. 
+ value_layer = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01))(concat_layer) + self.logits_and_value_model = tf.keras.models.Model( + concat_layer, [logits_layer, value_layer]) + else: + self.num_outputs = concat_size + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + # Push image observations through our CNNs. + outs = [] + for i, component in enumerate(input_dict["obs"]): + if i in self.cnns: + cnn_out, _ = self.cnns[i]({"obs": component}) + outs.append(cnn_out) + else: + outs.append(component) + # Concat all outputs and the non-image inputs. + out = tf.concat(outs, axis=1) + if not self.logits_and_value_model: + return out, [] + + # Value branch. + logits, values = self.logits_and_value_model(out) + self._value_out = tf.reshape(values, [-1]) + return logits, [] + + @override(ModelV2) + def value_function(self): + return self._value_out + + +# __sphinx_doc_end__ + + +class TorchCNNPlusFCConcatModel(TorchModelV2, nn.Module): + """TorchModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). + + Note: This model should be used for complex (Dict or Tuple) observation + spaces that have one or more image components. + """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + # TODO: (sven) Support Dicts as well. + assert isinstance(obs_space.original_space, (Tuple)), \ + "`obs_space.original_space` must be Tuple!" + + nn.Module.__init__(self) + TorchModelV2.__init__(self, obs_space, action_space, num_outputs, + model_config, name) + + # Atari type CNNs or IMPALA type CNNs (with residual layers)? + self.cnn_type = self.model_config["custom_model_config"].get( + "conv_type", "atari") + + # Build the CNN(s) given obs_space's image components. + self.cnns = {} + concat_size = 0 + for i, component in enumerate(obs_space.original_space): + # Image space. 
+ if len(component.shape) == 3: + config = { + "conv_filters": model_config.get( + "conv_filters", get_filter_config(component.shape)), + "conv_activation": model_config.get("conv_activation"), + } + if self.cnn_type == "atari": + cnn = ModelCatalog.get_model_v2( + component, + action_space, + num_outputs=None, + model_config=config, + framework="torch", + name="cnn_{}".format(i)) + else: + cnn = TorchImpalaVisionNet( + component, + action_space, + num_outputs=None, + model_config=config, + name="cnn_{}".format(i)) + + concat_size += cnn.num_outputs + self.cnns[i] = cnn + self.add_module("cnn_{}".format(i), cnn) + # Discrete inputs -> One-hot encode. + elif isinstance(component, Discrete): + concat_size += component.n + # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). + # Everything else (1D Box). + else: + assert len(component.shape) == 1, \ + "Only input Box 1D or 3D spaces allowed!" + concat_size += component.shape[-1] + + self.logits_layer = None + self.value_layer = None + self._value_out = None + + if num_outputs: + # Action-distribution head. + self.logits_layer = SlimFC( + in_size=concat_size, + out_size=num_outputs, + activation_fn=None, + ) + # Create the value branch model. + self.value_layer = SlimFC( + in_size=concat_size, + out_size=1, + activation_fn=None, + initializer=torch_normc_initializer(0.01)) + else: + self.num_outputs = concat_size + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + # Push image observations through our CNNs. + outs = [] + for i, component in enumerate(input_dict["obs"]): + if i in self.cnns: + cnn_out, _ = self.cnns[i]({"obs": component}) + outs.append(cnn_out) + else: + outs.append(component) + # Concat all outputs and the non-image inputs. + out = torch.cat(outs, dim=1) + if self.logits_layer is None: + return out, [] + + # Value branch. 
+ logits, values = self.logits_layer(out), self.value_layer(out) + self._value_out = torch.reshape(values, [-1]) + return logits, [] + + @override(ModelV2) + def value_function(self): + return self._value_out diff --git a/rllib/examples/models/trajectory_view_utilizing_models.py b/rllib/examples/models/trajectory_view_utilizing_models.py index 0fd4e22cb145..41f53d8724c4 100644 --- a/rllib/examples/models/trajectory_view_utilizing_models.py +++ b/rllib/examples/models/trajectory_view_utilizing_models.py @@ -3,8 +3,6 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.tf_ops import one_hot -from ray.rllib.utils.torch_ops import one_hot as torch_one_hot tf1, tf, tfv = try_import_tf() torch, nn = try_import_torch() @@ -30,42 +28,27 @@ def __init__(self, # Construct actual (very simple) FC model. assert len(obs_space.shape) == 1 - obs = tf.keras.layers.Input( + input_ = tf.keras.layers.Input( shape=(self.num_frames, obs_space.shape[0])) - obs_reshaped = tf.keras.layers.Reshape( - [obs_space.shape[0] * self.num_frames])(obs) - rewards = tf.keras.layers.Input(shape=(self.num_frames)) - rewards_reshaped = tf.keras.layers.Reshape([self.num_frames])(rewards) - actions = tf.keras.layers.Input( - shape=(self.num_frames, self.action_space.n)) - actions_reshaped = tf.keras.layers.Reshape( - [action_space.n * self.num_frames])(actions) - input_ = tf.keras.layers.Concatenate(axis=-1)( - [obs_reshaped, actions_reshaped, rewards_reshaped]) - layer1 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(input_) - layer2 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(layer1) - out = tf.keras.layers.Dense(self.num_outputs)(layer2) + reshaped = tf.keras.layers.Reshape( + [obs_space.shape[0] * self.num_frames])(input_) + layer1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)(reshaped) + out = 
tf.keras.layers.Dense(self.num_outputs)(layer1) values = tf.keras.layers.Dense(1)(layer1) - self.base_model = tf.keras.models.Model([obs, actions, rewards], - [out, values]) + self.base_model = tf.keras.models.Model([input_], [out, values]) + self._last_value = None self.view_requirements["prev_n_obs"] = ViewRequirement( data_col="obs", shift="-{}:0".format(num_frames - 1), space=obs_space) - self.view_requirements["prev_n_rewards"] = ViewRequirement( - data_col="rewards", shift="-{}:-1".format(self.num_frames)) - self.view_requirements["prev_n_actions"] = ViewRequirement( - data_col="actions", - shift="-{}:-1".format(self.num_frames), - space=self.action_space) + self.view_requirements["prev_rewards"] = ViewRequirement( + data_col="rewards", shift=-1) def forward(self, input_dict, states, seq_lens): - obs = tf.cast(input_dict["prev_n_obs"], tf.float32) - rewards = tf.cast(input_dict["prev_n_rewards"], tf.float32) - actions = one_hot(input_dict["prev_n_actions"], self.action_space) - out, self._last_value = self.base_model([obs, actions, rewards]) + obs = input_dict["prev_n_obs"] + out, self._last_value = self.base_model(obs) return out, [] def value_function(self): @@ -94,13 +77,13 @@ def __init__(self, # Construct actual (very simple) FC model. 
assert len(obs_space.shape) == 1 - in_size = self.num_frames * (obs_space.shape[0] + action_space.n + 1) self.layer1 = SlimFC( - in_size=in_size, out_size=256, activation_fn="relu") - self.layer2 = SlimFC(in_size=256, out_size=256, activation_fn="relu") + in_size=obs_space.shape[0] * self.num_frames, + out_size=64, + activation_fn="relu") self.out = SlimFC( - in_size=256, out_size=self.num_outputs, activation_fn="linear") - self.values = SlimFC(in_size=256, out_size=1, activation_fn="linear") + in_size=64, out_size=self.num_outputs, activation_fn="linear") + self.values = SlimFC(in_size=64, out_size=1, activation_fn="linear") self._last_value = None @@ -108,26 +91,14 @@ def __init__(self, data_col="obs", shift="-{}:0".format(num_frames - 1), space=obs_space) - self.view_requirements["prev_n_rewards"] = ViewRequirement( - data_col="rewards", shift="-{}:-1".format(self.num_frames)) - self.view_requirements["prev_n_actions"] = ViewRequirement( - data_col="actions", - shift="-{}:-1".format(self.num_frames), - space=self.action_space) + self.view_requirements["prev_rewards"] = ViewRequirement( + data_col="rewards", shift=-1) def forward(self, input_dict, states, seq_lens): obs = input_dict["prev_n_obs"] obs = torch.reshape(obs, [-1, self.obs_space.shape[0] * self.num_frames]) - rewards = torch.reshape(input_dict["prev_n_rewards"], - [-1, self.num_frames]) - actions = torch_one_hot(input_dict["prev_n_actions"], - self.action_space) - actions = torch.reshape(actions, - [-1, self.num_frames * actions.shape[-1]]) - input_ = torch.cat([obs, actions, rewards], dim=-1) - features = self.layer1(input_) - features = self.layer2(features) + features = self.layer1(obs) out = self.out(features) self._last_value = self.values(features) return out, [] diff --git a/rllib/examples/pettingzoo_env.py b/rllib/examples/pettingzoo_env.py index da49ccbdc22d..bd9901a17954 100644 --- a/rllib/examples/pettingzoo_env.py +++ b/rllib/examples/pettingzoo_env.py @@ -4,7 +4,7 @@ from supersuit import 
normalize_obs_v0, dtype_v0, color_reduction_v0 import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.env import PettingZooEnv from pettingzoo.butterfly import pistonball_v1 @@ -33,7 +33,7 @@ def env_creator(config): num_rollouts = 2 # 1. Gets default training configuration and specifies the POMgame to load. - config = deepcopy(get_trainer_class(alg_name)._default_config) + config = deepcopy(get_agent_class(alg_name)._default_config) # 2. Set environment config. This will be passed to # the env_creator function via the register env lambda below. @@ -76,7 +76,7 @@ def env_creator(config): # 6. Initialize ray and trainer object ray.init(num_cpus=num_cpus + 1) - trainer = get_trainer_class(alg_name)(env="pistonball", config=config) + trainer = get_agent_class(alg_name)(env="pistonball", config=config) # 7. Train once trainer.train() diff --git a/rllib/examples/rock_paper_scissors_multiagent.py b/rllib/examples/rock_paper_scissors_multiagent.py index 0eb3709c14a0..dde72248e9b8 100644 --- a/rllib/examples/rock_paper_scissors_multiagent.py +++ b/rllib/examples/rock_paper_scissors_multiagent.py @@ -14,7 +14,7 @@ from ray import tune from ray.rllib.agents.pg import PGTrainer, PGTFPolicy, PGTorchPolicy -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors from ray.rllib.examples.policy.rock_paper_scissors_dummies import \ BeatLastHeuristic, AlwaysSameHeuristic @@ -87,7 +87,7 @@ def select_policy(agent_id): }, "framework": "torch" if args.torch else "tf", } - cls = get_trainer_class(trainer) if isinstance(trainer, str) else trainer + cls = get_agent_class(trainer) if isinstance(trainer, str) else trainer trainer_obj = cls(config=config) env = trainer_obj.workers.local_worker().env for _ in range(args.stop_iters): diff --git 
a/rllib/examples/serving/cartpole_client.py b/rllib/examples/serving/cartpole_client.py index f2d45b5b3ea2..3541e0f6f7c6 100755 --- a/rllib/examples/serving/cartpole_client.py +++ b/rllib/examples/serving/cartpole_client.py @@ -17,7 +17,7 @@ parser.add_argument( "--no-train", action="store_true", help="Whether to disable training.") parser.add_argument( - "--inference-mode", type=str, default="local", choices=["local", "remote"]) + "--inference-mode", type=str, required=True, choices=["local", "remote"]) parser.add_argument( "--off-policy", action="store_true", diff --git a/rllib/examples/serving/cartpole_server.py b/rllib/examples/serving/cartpole_server.py index f76a34a91fc1..297320422ca0 100755 --- a/rllib/examples/serving/cartpole_server.py +++ b/rllib/examples/serving/cartpole_server.py @@ -13,7 +13,6 @@ from ray.rllib.agents.dqn import DQNTrainer from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.env.policy_server_input import PolicyServerInput -from ray.rllib.examples.custom_metrics_and_callbacks import MyCallbacks from ray.tune.logger import pretty_print SERVER_ADDRESS = "localhost" @@ -44,7 +43,6 @@ "num_workers": 0, # Disable OPE, since the rollouts are coming from online clients. 
"input_evaluation": [], - "callbacks": MyCallbacks, } if args.run == "DQN": diff --git a/rllib/examples/trajectory_view_api.py b/rllib/examples/trajectory_view_api.py index a720617793d2..400051ad506f 100644 --- a/rllib/examples/trajectory_view_api.py +++ b/rllib/examples/trajectory_view_api.py @@ -2,7 +2,6 @@ import ray from ray import tune -from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole from ray.rllib.examples.models.trajectory_view_utilizing_models import \ FrameStackingCartPoleModel, TorchFrameStackingCartPoleModel from ray.rllib.models.catalog import ModelCatalog @@ -17,7 +16,7 @@ "--framework", choices=["tf2", "tf", "tfe", "torch"], default="tf") parser.add_argument("--as-test", action="store_true") parser.add_argument("--stop-iters", type=int, default=50) -parser.add_argument("--stop-timesteps", type=int, default=200000) +parser.add_argument("--stop-timesteps", type=int, default=100000) parser.add_argument("--stop-reward", type=float, default=150.0) if __name__ == "__main__": @@ -27,14 +26,13 @@ ModelCatalog.register_custom_model( "frame_stack_model", FrameStackingCartPoleModel if args.framework != "torch" else TorchFrameStackingCartPoleModel) - tune.register_env("stateless_cartpole", lambda c: StatelessCartPole()) config = { - "env": "stateless_cartpole", + "env": "CartPole-v0", "model": { "custom_model": "frame_stack_model", "custom_model_config": { - "num_frames": 16, + "num_frames": 4, } }, "framework": args.framework, diff --git a/rllib/execution/learner_thread.py b/rllib/execution/learner_thread.py index 4f1f6e84275f..8f5350fa146d 100644 --- a/rllib/execution/learner_thread.py +++ b/rllib/execution/learner_thread.py @@ -1,7 +1,8 @@ +from typing import Dict +import threading import copy + from six.moves import queue -import threading -from typing import Dict from ray.rllib.evaluation.metrics import get_learner_stats from ray.rllib.execution.minibatch_buffer import MinibatchBuffer @@ -68,10 +69,7 @@ def run(self) -> None: def 
step(self) -> None: with self.queue_timer: - try: - batch, _ = self.minibatch_buffer.get() - except queue.Empty: - return + batch, _ = self.minibatch_buffer.get() with self.grad_timer: fetches = self.local_worker.learn_on_batch(batch) diff --git a/rllib/execution/metric_ops.py b/rllib/execution/metric_ops.py index 06857f674a8e..70ae38e3fbf8 100644 --- a/rllib/execution/metric_ops.py +++ b/rllib/execution/metric_ops.py @@ -88,7 +88,6 @@ def __call__(self, _: Any) -> Dict: # Add in iterator metrics. metrics = _get_shared_metrics() - custom_metrics_from_info = metrics.info.pop("custom_metrics", {}) timers = {} counters = {} info = {} @@ -107,8 +106,6 @@ def __call__(self, _: Any) -> Dict: res["timers"] = timers res["info"] = info res["info"].update(counters) - res["custom_metrics"] = res.get("custom_metrics", {}) - res["custom_metrics"].update(custom_metrics_from_info) return res diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index fe8e7b95b6f5..e2411ed3279a 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -5,8 +5,7 @@ from typing import List, Tuple, Any import ray -from ray.rllib.evaluation.metrics import extract_stats, get_learner_stats, \ - LEARNER_STATS_KEY +from ray.rllib.evaluation.metrics import get_learner_stats, LEARNER_STATS_KEY from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.execution.common import \ STEPS_SAMPLED_COUNTER, STEPS_TRAINED_COUNTER, LEARNER_INFO, \ @@ -59,25 +58,18 @@ def __call__(self, learn_timer = metrics.timers[LEARN_ON_BATCH_TIMER] with learn_timer: if self.num_sgd_iter > 1 or self.sgd_minibatch_size > 0: - lw = self.workers.local_worker() + w = self.workers.local_worker() info = do_minibatch_sgd( - batch, {pid: lw.get_policy(pid) - for pid in self.policies}, lw, self.num_sgd_iter, + batch, {p: w.get_policy(p) + for p in self.policies}, w, self.num_sgd_iter, self.sgd_minibatch_size, []) # TODO(ekl) shouldn't be returning learner stats directly here - # 
TODO(sven): Skips `custom_metrics` key from on_learn_on_batch - # callback (shouldn't). metrics.info[LEARNER_INFO] = info else: info = self.workers.local_worker().learn_on_batch(batch) - metrics.info[LEARNER_INFO] = extract_stats( - info, LEARNER_STATS_KEY) - metrics.info["custom_metrics"] = extract_stats( - info, "custom_metrics") + metrics.info[LEARNER_INFO] = get_learner_stats(info) learn_timer.push_units_processed(batch.count) metrics.counters[STEPS_TRAINED_COUNTER] += batch.count - # Update weights - after learning on the local worker - on all remote - # workers. if self.workers.remote_workers(): with metrics.timers[WORKER_UPDATE_TIMER]: weights = ray.put(self.workers.local_worker().get_weights( diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 74ddcbeab2f5..8e3e43dd08b3 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -19,7 +19,7 @@ TorchDeterministic, TorchDiagGaussian, \ TorchMultiActionDistribution, TorchMultiCategorical from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI -from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.spaces.simplex import Simplex @@ -56,18 +56,6 @@ # "linear" (or None). "conv_activation": "relu", - # Some default models support a final FC stack of n Dense layers with given - # activation: - # - Complex observation spaces: Image components are fed through - # VisionNets, flat Boxes are left as-is, Discrete are one-hot'd, then - # everything is concated and pushed through this final FC stack. - # - VisionNets (CNNs), e.g. after the CNN stack, there may be - # additional Dense layers. - # - FullyConnectedNetworks will have this additional FCStack as well - # (that's why it's empty by default). 
- "post_fcnet_hiddens": [], - "post_fcnet_activation": "relu", - # For DiagGaussian action distributions, make the second half of the model # outputs floating bias variables instead of state-dependent. This only # has an effect is using the default fully connected net. @@ -211,14 +199,13 @@ def get_action_dist( config = config or MODEL_DEFAULTS # Custom distribution given. if config.get("custom_action_dist"): - custom_action_config = config.copy() - action_dist_name = custom_action_config.pop("custom_action_dist") + action_dist_name = config["custom_action_dist"] logger.debug( "Using custom action distribution {}".format(action_dist_name)) dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) - return ModelCatalog._get_multi_action_distribution( - dist_cls, action_space, custom_action_config, framework) + dist_cls = ModelCatalog._get_multi_action_distribution( + dist_cls, action_space, {}, framework) # Dist_type is given directly as a class. elif type(dist_type) is type and \ @@ -700,22 +687,17 @@ def _get_v2_model_class(input_space: gym.Space, framework: str = "tf") -> Type[ModelV2]: VisionNet = None - ComplexNet = None if framework in ["tf2", "tf", "tfe"]: from ray.rllib.models.tf.fcnet import \ FullyConnectedNetwork as FCNet from ray.rllib.models.tf.visionnet import \ VisionNetwork as VisionNet - from ray.rllib.models.tf.complex_input_net import \ - ComplexInputNetwork as ComplexNet elif framework == "torch": from ray.rllib.models.torch.fcnet import (FullyConnectedNetwork as FCNet) from ray.rllib.models.torch.visionnet import (VisionNetwork as VisionNet) - from ray.rllib.models.torch.complex_input_net import \ - ComplexInputNetwork as ComplexNet elif framework == "jax": from ray.rllib.models.jax.fcnet import (FullyConnectedNetwork as FCNet) @@ -727,29 +709,16 @@ def _get_v2_model_class(input_space: gym.Space, # Discrete/1D obs-spaces or 2D obs space but traj. view framestacking # disabled. 
num_framestacks = model_config.get("num_framestacks", "auto") - - # Tuple space, where at least one sub-space is image. - # -> Complex input model. - space_to_check = input_space if not hasattr( - input_space, "original_space") else input_space.original_space - if isinstance(input_space, - Tuple) or (isinstance(space_to_check, Tuple) and any( - isinstance(s, Box) and len(s.shape) >= 2 - for s in space_to_check.spaces)): - return ComplexNet - - # Single, flattenable/one-hot-abe space -> Simple FCNet. if isinstance(input_space, (Discrete, MultiDiscrete)) or \ len(input_space.shape) == 1 or ( len(input_space.shape) == 2 and ( num_framestacks == "auto" or num_framestacks <= 1)): return FCNet - - elif framework == "jax": - raise NotImplementedError("No non-FC default net for JAX yet!") - - # Last resort: Conv2D stack for single image spaces. - return VisionNet + # Default Conv2D net. + else: + if framework == "jax": + raise NotImplementedError("No Conv2D default net for JAX yet!") + return VisionNet @staticmethod def _get_multi_action_distribution(dist_class, action_space, config, @@ -771,8 +740,7 @@ def _get_multi_action_distribution(dist_class, action_space, config, action_space=action_space, child_distributions=child_dists, input_lens=input_lens), int(sum(input_lens)) - return dist_class, dist_class.required_model_output_shape( - action_space, config) + return dist_class @staticmethod def _validate_config(config: ModelConfigDict, framework: str) -> None: @@ -798,8 +766,8 @@ def _validate_config(config: ModelConfigDict, framework: str) -> None: "framework=jax so far!") if config.get("framestack") != DEPRECATED_VALUE: - # deprecation_warning( - # old="framestack", new="num_framestacks (int)", error=False) + deprecation_warning( + old="framestack", new="num_framestacks (int)", error=False) # If old behavior is desired, disable traj. view-style # framestacking. 
config["num_framestacks"] = 0 diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index bd5ee113219b..70ad50202421 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -203,13 +203,9 @@ def __call__( restored = input_dict.copy() restored["obs"] = restore_original_dimensions( input_dict["obs"], self.obs_space, self.framework) - try: - if len(input_dict["obs"].shape) > 2: - restored["obs_flat"] = flatten(input_dict["obs"], - self.framework) - else: - restored["obs_flat"] = input_dict["obs"] - except AttributeError: + if len(input_dict["obs"].shape) > 2: + restored["obs_flat"] = flatten(input_dict["obs"], self.framework) + else: restored["obs_flat"] = input_dict["obs"] with self.context(): res = self.forward(restored, state or [], seq_lens) @@ -220,6 +216,15 @@ def __call__( "got {}".format(res)) outputs, state = res + try: + shape = outputs.shape + except AttributeError: + raise ValueError("Output is not a tensor: {}".format(outputs)) + else: + if len(shape) != 2 or int(shape[1]) != self.num_outputs: + raise ValueError( + "Expected output shape of [None, {}], got {}".format( + self.num_outputs, shape)) if not isinstance(state, list): raise ValueError("State output is not a list: {}".format(state)) @@ -413,15 +418,15 @@ def restore_original_dimensions(obs: TensorType, observation space. """ - if tensorlib == "tf": - tensorlib = tf - elif tensorlib == "torch": - assert torch is not None - tensorlib = torch - original_space = getattr(obs_space, "original_space", obs_space) - if original_space is obs_space: + if hasattr(obs_space, "original_space"): + if tensorlib == "tf": + tensorlib = tf + elif tensorlib == "torch": + assert torch is not None + tensorlib = torch + return _unpack_obs(obs, obs_space.original_space, tensorlib=tensorlib) + else: return obs - return _unpack_obs(obs, original_space, tensorlib=tensorlib) # Cache of preprocessors, for if the user is calling unpack obs often. 
@@ -485,8 +490,7 @@ def _unpack_obs(obs: TensorType, space: gym.Space, tensorlib.reshape(obs_slice, batch_dims + list(p.shape)), v, tensorlib=tensorlib) - # Repeated space. - else: + elif isinstance(space, Repeated): assert isinstance(prep, RepeatedValuesPreprocessor), prep child_size = prep.child_preprocessor.size # The list lengths are stored in the first slot of the flat obs. @@ -499,6 +503,8 @@ def _unpack_obs(obs: TensorType, space: gym.Space, with_repeat_dim, space.child_space, tensorlib=tensorlib) return RepeatedValues( u, lengths=lengths, max_len=prep._obs_space.max_len) + else: + assert False, space return u else: return obs diff --git a/rllib/models/preprocessors.py b/rllib/models/preprocessors.py index 0abfb8658080..2b0bcb092062 100644 --- a/rllib/models/preprocessors.py +++ b/rllib/models/preprocessors.py @@ -140,7 +140,7 @@ def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: @override(Preprocessor) def transform(self, observation: TensorType) -> np.ndarray: self.check_shape(observation) - return (observation.astype("float32") - 128) / 128 + return (observation - 128) / 128 class OneHotPreprocessor(Preprocessor): @@ -174,7 +174,7 @@ def transform(self, observation: TensorType) -> np.ndarray: @override(Preprocessor) def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: - array[offset:offset + self.size] = self.transform(observation) + array[offset + observation] = 1 class NoPreprocessor(Preprocessor): diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index 4ce7b73e7e74..5515b6fea6b1 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -71,17 +71,6 @@ def test_one_hot_preprocessor(self): pp.transform(np.array([0, 1, 3])), [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]) - def test_nested_multidiscrete_one_hot_preprocessor(self): - space = Tuple((MultiDiscrete([2, 3, 4]), )) - pp = get_preprocessor(space)(space) - 
self.assertTrue(pp.shape == (9, )) - check( - pp.transform((np.array([1, 2, 0]), )), - [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - check( - pp.transform((np.array([0, 1, 3]), )), - [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]) - if __name__ == "__main__": import pytest diff --git a/rllib/models/tf/complex_input_net.py b/rllib/models/tf/complex_input_net.py deleted file mode 100644 index 8bc691e2405e..000000000000 --- a/rllib/models/tf/complex_input_net.py +++ /dev/null @@ -1,156 +0,0 @@ -from gym.spaces import Box, Discrete, Tuple -import numpy as np - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2, restore_original_dimensions -from ray.rllib.models.tf.misc import normc_initializer -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.utils import get_filter_config -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_ops import one_hot - -tf1, tf, tfv = try_import_tf() - - -# __sphinx_doc_begin__ -class ComplexInputNetwork(TFModelV2): - """TFModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). - - Note: This model should be used for complex (Dict or Tuple) observation - spaces that have one or more image components. - - The data flow is as follows: - - `obs` (e.g. Tuple[img0, img1, discrete0]) -> `CNN0 + CNN1 + ONE-HOT` - `CNN0 + CNN1 + ONE-HOT` -> concat all flat outputs -> `out` - `out` -> (optional) FC-stack -> `out2` - `out2` -> action (logits) and vaulue heads. - """ - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - # TODO: (sven) Support Dicts as well. - self.original_space = obs_space.original_space if \ - hasattr(obs_space, "original_space") else obs_space - assert isinstance(self.original_space, (Tuple)), \ - "`obs_space.original_space` must be Tuple!" 
- - super().__init__(self.original_space, action_space, num_outputs, - model_config, name) - - # Build the CNN(s) given obs_space's image components. - self.cnns = {} - self.one_hot = {} - self.flatten = {} - concat_size = 0 - for i, component in enumerate(self.original_space): - # Image space. - if len(component.shape) == 3: - config = { - "conv_filters": model_config.get( - "conv_filters", get_filter_config(component.shape)), - "conv_activation": model_config.get("conv_activation"), - "post_fcnet_hiddens": [], - } - cnn = ModelCatalog.get_model_v2( - component, - action_space, - num_outputs=None, - model_config=config, - framework="tf", - name="cnn_{}".format(i)) - concat_size += cnn.num_outputs - self.cnns[i] = cnn - # Discrete inputs -> One-hot encode. - elif isinstance(component, Discrete): - self.one_hot[i] = True - concat_size += component.n - # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). - # Everything else (1D Box). - else: - self.flatten[i] = int(np.product(component.shape)) - concat_size += self.flatten[i] - - # Optional post-concat FC-stack. - post_fc_stack_config = { - "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), - "fcnet_activation": model_config.get("post_fcnet_activation", - "relu") - } - self.post_fc_stack = ModelCatalog.get_model_v2( - Box(float("-inf"), - float("inf"), - shape=(concat_size, ), - dtype=np.float32), - self.action_space, - None, - post_fc_stack_config, - framework="tf", - name="post_fc_stack") - - # Actions and value heads. - self.logits_and_value_model = None - self._value_out = None - if num_outputs: - # Action-distribution head. - concat_layer = tf.keras.layers.Input( - (self.post_fc_stack.num_outputs, )) - logits_layer = tf.keras.layers.Dense( - num_outputs, - activation=tf.keras.activations.linear, - name="logits")(concat_layer) - - # Create the value branch model. 
- value_layer = tf.keras.layers.Dense( - 1, - name="value_out", - activation=None, - kernel_initializer=normc_initializer(0.01))(concat_layer) - self.logits_and_value_model = tf.keras.models.Model( - concat_layer, [logits_layer, value_layer]) - else: - self.num_outputs = self.post_fc_stack.num_outputs - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - if SampleBatch.OBS in input_dict and "obs_flat" in input_dict: - orig_obs = input_dict[SampleBatch.OBS] - else: - orig_obs = restore_original_dimensions(input_dict[SampleBatch.OBS], - self.obs_space, "tf") - # Push image observations through our CNNs. - outs = [] - for i, component in enumerate(orig_obs): - if i in self.cnns: - cnn_out, _ = self.cnns[i]({SampleBatch.OBS: component}) - outs.append(cnn_out) - elif i in self.one_hot: - if component.dtype in [tf.int32, tf.int64, tf.uint8]: - outs.append( - one_hot(component, self.original_space.spaces[i])) - else: - outs.append(component) - else: - outs.append(tf.reshape(component, [-1, self.flatten[i]])) - # Concat all outputs and the non-image inputs. - out = tf.concat(outs, axis=1) - # Push through (optional) FC-stack (this may be an empty stack). - out, _ = self.post_fc_stack({SampleBatch.OBS: out}, [], None) - - # No logits/value branches. - if not self.logits_and_value_model: - return out, [] - - # Logits- and value branches. 
- logits, values = self.logits_and_value_model(out) - self._value_out = tf.reshape(values, [-1]) - return logits, [] - - @override(ModelV2) - def value_function(self): - return self._value_out - - -# __sphinx_doc_end__ diff --git a/rllib/models/tf/fcnet.py b/rllib/models/tf/fcnet.py index 9b0e8c565374..eea01014db9e 100644 --- a/rllib/models/tf/fcnet.py +++ b/rllib/models/tf/fcnet.py @@ -19,12 +19,8 @@ def __init__(self, obs_space: gym.spaces.Space, super(FullyConnectedNetwork, self).__init__( obs_space, action_space, num_outputs, model_config, name) - hiddens = model_config.get("fcnet_hiddens", []) + \ - model_config.get("post_fcnet_hiddens", []) - activation = model_config.get("fcnet_activation") - if not model_config.get("fcnet_hiddens", []): - activation = model_config.get("post_fcnet_activation") - activation = get_activation_fn(activation) + activation = get_activation_fn(model_config.get("fcnet_activation")) + hiddens = model_config.get("fcnet_hiddens", []) no_final_linear = model_config.get("no_final_linear") vf_share_layers = model_config.get("vf_share_layers") free_log_std = model_config.get("free_log_std") diff --git a/rllib/models/tf/tf_modelv2.py b/rllib/models/tf/tf_modelv2.py index dfb850a339f7..4394d321304a 100644 --- a/rllib/models/tf/tf_modelv2.py +++ b/rllib/models/tf/tf_modelv2.py @@ -107,8 +107,7 @@ def _find_sub_modules(current_key, struct): if isinstance(struct, tf.keras.models.Model): ret = {} for var in struct.variables: - name = re.sub("/", ".", var.name) - key = current_key + "." + name + key = current_key + "." + re.sub("/", ".", var.name) ret[key] = var return ret # Other TFModelV2: Include its vars into ours. @@ -119,7 +118,7 @@ def _find_sub_modules(current_key, struct): } # tf.Variable elif isinstance(struct, tf.Variable): - return {current_key: struct} + return {current_key + "." + struct.name: struct} # List/Tuple. 
elif isinstance(struct, (tuple, list)): ret = {} @@ -134,7 +133,7 @@ def _find_sub_modules(current_key, struct): current_key += "_" ret = {} for key, value in struct.items(): - sub_vars = TFModelV2._find_sub_modules(current_key + str(key), + sub_vars = TFModelV2._find_sub_modules(current_key + key, value) ret.update(sub_vars) return ret diff --git a/rllib/models/tf/visionnet.py b/rllib/models/tf/visionnet.py index 955ac1e52e7f..b83e867b6545 100644 --- a/rllib/models/tf/visionnet.py +++ b/rllib/models/tf/visionnet.py @@ -13,17 +13,7 @@ class VisionNetwork(TFModelV2): - """Generic vision network implemented in ModelV2 API. - - An additional post-conv fully connected stack can be added and configured - via the config keys: - `post_fcnet_hiddens`: Dense layer sizes after the Conv2D stack. - `post_fcnet_activation`: Activation function to use for this FC stack. - - Examples: - - - """ + """Generic vision network implemented in ModelV2 API.""" def __init__(self, obs_space: gym.spaces.Space, action_space: gym.spaces.Space, num_outputs: int, @@ -39,12 +29,6 @@ def __init__(self, obs_space: gym.spaces.Space, filters = self.model_config["conv_filters"] assert len(filters) > 0,\ "Must provide at least 1 entry in `conv_filters`!" - - # Post FC net config. - post_fcnet_hiddens = model_config.get("post_fcnet_hiddens", []) - post_fcnet_activation = get_activation_fn( - model_config.get("post_fcnet_activation"), framework="tf") - no_final_linear = self.model_config.get("no_final_linear") vf_share_layers = self.model_config.get("vf_share_layers") self.traj_view_framestacking = False @@ -78,29 +62,17 @@ def __init__(self, obs_space: gym.spaces.Space, out_size, kernel, stride = filters[-1] - # No final linear: Last layer has activation function and exits with - # num_outputs nodes (this could be a 1x1 conv or a FC layer, depending - # on `post_fcnet_...` settings). + # No final linear: Last layer is a Conv2D and uses num_outputs. 
if no_final_linear and num_outputs: last_layer = tf.keras.layers.Conv2D( - out_size if post_fcnet_hiddens else num_outputs, + num_outputs, kernel, strides=(stride, stride), activation=activation, padding="valid", data_format="channels_last", name="conv_out")(last_layer) - # Add (optional) post-fc-stack after last Conv2D layer. - layer_sizes = post_fcnet_hiddens[:-1] + ([num_outputs] - if post_fcnet_hiddens else - []) - for i, out_size in enumerate(layer_sizes): - last_layer = tf.keras.layers.Dense( - out_size, - name="post_fcnet_{}".format(i), - activation=post_fcnet_activation, - kernel_initializer=normc_initializer(1.0))(last_layer) - + conv_out = last_layer # Finish network normally (w/o overriding last layer size with # `num_outputs`), then add another linear one of size `num_outputs`. else: @@ -116,56 +88,29 @@ def __init__(self, obs_space: gym.spaces.Space, # num_outputs defined. Use that to create an exact # `num_output`-sized (1,1)-Conv2D. if num_outputs: - if post_fcnet_hiddens: - last_cnn = last_layer = tf.keras.layers.Conv2D( - post_fcnet_hiddens[0], [1, 1], - activation=post_fcnet_activation, - padding="same", - data_format="channels_last", - name="conv_out")(last_layer) - # Add (optional) post-fc-stack after last Conv2D layer. 
- for i, out_size in enumerate(post_fcnet_hiddens[1:] + - [num_outputs]): - last_layer = tf.keras.layers.Dense( - out_size, - name="post_fcnet_{}".format(i + 1), - activation=post_fcnet_activation - if i < len(post_fcnet_hiddens) - 1 else None, - kernel_initializer=normc_initializer(1.0))( - last_layer) - else: - last_cnn = last_layer = tf.keras.layers.Conv2D( - num_outputs, [1, 1], - activation=None, - padding="same", - data_format="channels_last", - name="conv_out")(last_layer) - - if last_cnn.shape[1] != 1 or last_cnn.shape[2] != 1: + conv_out = tf.keras.layers.Conv2D( + num_outputs, [1, 1], + activation=None, + padding="same", + data_format="channels_last", + name="conv_out")(last_layer) + + if conv_out.shape[1] != 1 or conv_out.shape[2] != 1: raise ValueError( "Given `conv_filters` ({}) do not result in a [B, 1, " "1, {} (`num_outputs`)] shape (but in {})! Please " "adjust your Conv2D stack such that the dims 1 and 2 " "are both 1.".format(self.model_config["conv_filters"], self.num_outputs, - list(last_cnn.shape))) + list(conv_out.shape))) # num_outputs not known -> Flatten, then set self.num_outputs # to the resulting number of nodes. else: self.last_layer_is_flattened = True - last_layer = tf.keras.layers.Flatten( + conv_out = tf.keras.layers.Flatten( data_format="channels_last")(last_layer) - - # Add (optional) post-fc-stack after last Conv2D layer. 
- for i, out_size in enumerate(post_fcnet_hiddens): - last_layer = tf.keras.layers.Dense( - out_size, - name="post_fcnet_{}".format(i), - activation=post_fcnet_activation, - kernel_initializer=normc_initializer(1.0))(last_layer) - self.num_outputs = last_layer.shape[1] - logits_out = last_layer + self.num_outputs = conv_out.shape[1] # Build the value layers if vf_share_layers: @@ -206,7 +151,7 @@ def __init__(self, obs_space: gym.spaces.Space, value_out = tf.keras.layers.Lambda( lambda x: tf.squeeze(x, axis=[1, 2]))(last_layer) - self.base_model = tf.keras.Model(inputs, [logits_out, value_out]) + self.base_model = tf.keras.Model(inputs, [conv_out, value_out]) # Optional: framestacking obs/new_obs for Atari. if self.traj_view_framestacking: diff --git a/rllib/models/torch/complex_input_net.py b/rllib/models/torch/complex_input_net.py deleted file mode 100644 index 2b9601947a5e..000000000000 --- a/rllib/models/torch/complex_input_net.py +++ /dev/null @@ -1,163 +0,0 @@ -from gym.spaces import Box, Discrete, Tuple -import numpy as np - -# TODO (sven): add IMPALA-style option. -# from ray.rllib.examples.models.impala_vision_nets import TorchImpalaVisionNet -from ray.rllib.models.torch.misc import normc_initializer as \ - torch_normc_initializer, SlimFC -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.models.utils import get_filter_config -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_ops import one_hot - -torch, nn = try_import_torch() - - -class ComplexInputNetwork(TorchModelV2, nn.Module): - """TorchModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). - - Note: This model should be used for complex (Dict or Tuple) observation - spaces that have one or more image components. - - The data flow is as follows: - - `obs` (e.g. 
Tuple[img0, img1, discrete0]) -> `CNN0 + CNN1 + ONE-HOT` - `CNN0 + CNN1 + ONE-HOT` -> concat all flat outputs -> `out` - `out` -> (optional) FC-stack -> `out2` - `out2` -> action (logits) and vaulue heads. - """ - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - # TODO: (sven) Support Dicts as well. - self.original_space = obs_space.original_space if \ - hasattr(obs_space, "original_space") else obs_space - assert isinstance(self.original_space, (Tuple)), \ - "`obs_space.original_space` must be Tuple!" - - nn.Module.__init__(self) - TorchModelV2.__init__(self, self.original_space, action_space, - num_outputs, model_config, name) - - # Atari type CNNs or IMPALA type CNNs (with residual layers)? - # self.cnn_type = self.model_config["custom_model_config"].get( - # "conv_type", "atari") - - # Build the CNN(s) given obs_space's image components. - self.cnns = {} - self.one_hot = {} - self.flatten = {} - concat_size = 0 - for i, component in enumerate(self.original_space): - # Image space. - if len(component.shape) == 3: - config = { - "conv_filters": model_config.get( - "conv_filters", get_filter_config(component.shape)), - "conv_activation": model_config.get("conv_activation"), - "post_fcnet_hiddens": [], - } - # if self.cnn_type == "atari": - cnn = ModelCatalog.get_model_v2( - component, - action_space, - num_outputs=None, - model_config=config, - framework="torch", - name="cnn_{}".format(i)) - # TODO (sven): add IMPALA-style option. - # else: - # cnn = TorchImpalaVisionNet( - # component, - # action_space, - # num_outputs=None, - # model_config=config, - # name="cnn_{}".format(i)) - - concat_size += cnn.num_outputs - self.cnns[i] = cnn - self.add_module("cnn_{}".format(i), cnn) - # Discrete inputs -> One-hot encode. - elif isinstance(component, Discrete): - self.one_hot[i] = True - concat_size += component.n - # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). - # Everything else (1D Box). 
- else: - self.flatten[i] = int(np.product(component.shape)) - concat_size += self.flatten[i] - - # Optional post-concat FC-stack. - post_fc_stack_config = { - "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), - "fcnet_activation": model_config.get("post_fcnet_activation", - "relu") - } - self.post_fc_stack = ModelCatalog.get_model_v2( - Box(float("-inf"), - float("inf"), - shape=(concat_size, ), - dtype=np.float32), - self.action_space, - None, - post_fc_stack_config, - framework="torch", - name="post_fc_stack") - - # Actions and value heads. - self.logits_layer = None - self.value_layer = None - self._value_out = None - - if num_outputs: - # Action-distribution head. - self.logits_layer = SlimFC( - in_size=self.post_fc_stack.num_outputs, - out_size=num_outputs, - activation_fn=None, - ) - # Create the value branch model. - self.value_layer = SlimFC( - in_size=self.post_fc_stack.num_outputs, - out_size=1, - activation_fn=None, - initializer=torch_normc_initializer(0.01)) - else: - self.num_outputs = concat_size - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - # Push image observations through our CNNs. - outs = [] - for i, component in enumerate(input_dict["obs"]): - if i in self.cnns: - cnn_out, _ = self.cnns[i]({"obs": component}) - outs.append(cnn_out) - elif i in self.one_hot: - if component.dtype in [torch.int32, torch.int64, torch.uint8]: - outs.append( - one_hot(component, self.original_space.spaces[i])) - else: - outs.append(component) - else: - outs.append(torch.reshape(component, [-1, self.flatten[i]])) - # Concat all outputs and the non-image inputs. - out = torch.cat(outs, dim=1) - # Push through (optional) FC-stack (this may be an empty stack). - out, _ = self.post_fc_stack({"obs": out}, [], None) - - # No logits/value branches. - if self.logits_layer is None: - return out, [] - - # Logits- and value branches. 
- logits, values = self.logits_layer(out), self.value_layer(out) - self._value_out = torch.reshape(values, [-1]) - return logits, [] - - @override(ModelV2) - def value_function(self): - return self._value_out diff --git a/rllib/models/torch/fcnet.py b/rllib/models/torch/fcnet.py index dc1608156a67..58fbb6bc476d 100644 --- a/rllib/models/torch/fcnet.py +++ b/rllib/models/torch/fcnet.py @@ -24,11 +24,8 @@ def __init__(self, obs_space: gym.spaces.Space, model_config, name) nn.Module.__init__(self) - hiddens = model_config.get("fcnet_hiddens", []) + \ - model_config.get("post_fcnet_hiddens", []) activation = model_config.get("fcnet_activation") - if not model_config.get("fcnet_hiddens", []): - activation = model_config.get("post_fcnet_activation") + hiddens = model_config.get("fcnet_hiddens", []) no_final_linear = model_config.get("no_final_linear") self.vf_share_layers = model_config.get("vf_share_layers") self.free_log_std = model_config.get("free_log_std") @@ -109,7 +106,7 @@ def __init__(self, obs_space: gym.spaces.Space, self._value_branch = SlimFC( in_size=prev_layer_size, out_size=1, - initializer=normc_initializer(0.01), + initializer=normc_initializer(1.0), activation_fn=None) # Holds the current "base" output (before logits layer). self._features = None diff --git a/rllib/models/torch/misc.py b/rllib/models/torch/misc.py index 9f6d8234e87f..830e8bc33b5e 100644 --- a/rllib/models/torch/misc.py +++ b/rllib/models/torch/misc.py @@ -139,9 +139,8 @@ def __init__(self, layers = [] # Actual nn.Linear layer (including correct initialization logic). 
linear = nn.Linear(in_size, out_size, bias=use_bias) - if initializer is None: - initializer = nn.init.xavier_uniform_ - initializer(linear.weight) + if initializer: + initializer(linear.weight) if use_bias is True: nn.init.constant_(linear.bias, bias_init) layers.append(linear) diff --git a/rllib/models/torch/visionnet.py b/rllib/models/torch/visionnet.py index 133c851f5b7a..cd6352acd532 100644 --- a/rllib/models/torch/visionnet.py +++ b/rllib/models/torch/visionnet.py @@ -5,7 +5,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.models.torch.misc import normc_initializer, same_padding, \ SlimConv2d, SlimFC -from ray.rllib.models.utils import get_activation_fn, get_filter_config +from ray.rllib.models.utils import get_filter_config from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.annotations import override @@ -33,12 +33,6 @@ def __init__(self, obs_space: gym.spaces.Space, filters = self.model_config["conv_filters"] assert len(filters) > 0,\ "Must provide at least 1 entry in `conv_filters`!" - - # Post FC net config. - post_fcnet_hiddens = model_config.get("post_fcnet_hiddens", []) - post_fcnet_activation = get_activation_fn( - model_config.get("post_fcnet_activation"), framework="torch") - no_final_linear = self.model_config.get("no_final_linear") vf_share_layers = self.model_config.get("vf_share_layers") @@ -74,33 +68,17 @@ def __init__(self, obs_space: gym.spaces.Space, out_channels, kernel, stride = filters[-1] - # No final linear: Last layer has activation function and exits with - # num_outputs nodes (this could be a 1x1 conv or a FC layer, depending - # on `post_fcnet_...` settings). + # No final linear: Last layer is a Conv2D and uses num_outputs. 
if no_final_linear and num_outputs: - out_channels = out_channels if post_fcnet_hiddens else num_outputs layers.append( SlimConv2d( in_channels, - out_channels, + num_outputs, kernel, stride, None, # padding=valid activation_fn=activation)) - - # Add (optional) post-fc-stack after last Conv2D layer. - layer_sizes = post_fcnet_hiddens[:-1] + ([num_outputs] - if post_fcnet_hiddens else - []) - for i, out_size in enumerate(layer_sizes): - layers.append( - SlimFC( - in_size=out_channels, - out_size=out_size, - activation_fn=post_fcnet_activation, - initializer=normc_initializer(1.0))) - out_channels = out_size - + out_channels = num_outputs # Finish network normally (w/o overriding last layer size with # `num_outputs`), then add another linear one of size `num_outputs`. else: @@ -121,31 +99,12 @@ def __init__(self, obs_space: gym.spaces.Space, np.ceil((in_size[1] - kernel[1]) / stride) ] padding, _ = same_padding(in_size, [1, 1], [1, 1]) - if post_fcnet_hiddens: - layers.append(nn.Flatten()) - in_size = out_channels - # Add (optional) post-fc-stack after last Conv2D layer. - for i, out_size in enumerate(post_fcnet_hiddens + - [num_outputs]): - layers.append( - SlimFC( - in_size=in_size, - out_size=out_size, - activation_fn=post_fcnet_activation - if i < len(post_fcnet_hiddens) - 1 else None, - initializer=normc_initializer(1.0))) - in_size = out_size - # Last layer is logits layer. - self._logits = layers.pop() - - else: - self._logits = SlimConv2d( - out_channels, - num_outputs, [1, 1], - 1, - padding, - activation_fn=None) - + self._logits = SlimConv2d( + out_channels, + num_outputs, [1, 1], + 1, + padding, + activation_fn=None) # num_outputs not known -> Flatten, then set self.num_outputs # to the resulting number of nodes. 
else: @@ -237,19 +196,16 @@ def forward(self, input_dict: Dict[str, TensorType], if not self.last_layer_is_flattened: if self._logits: conv_out = self._logits(conv_out) - if len(conv_out.shape) == 4: - if conv_out.shape[2] != 1 or conv_out.shape[3] != 1: - raise ValueError( - "Given `conv_filters` ({}) do not result in a [B, {} " - "(`num_outputs`), 1, 1] shape (but in {})! Please " - "adjust your Conv2D stack such that the last 2 dims " - "are both 1.".format(self.model_config["conv_filters"], - self.num_outputs, - list(conv_out.shape))) - logits = conv_out.squeeze(3) - logits = logits.squeeze(2) - else: - logits = conv_out + if conv_out.shape[2] != 1 or conv_out.shape[3] != 1: + raise ValueError( + "Given `conv_filters` ({}) do not result in a [B, {} " + "(`num_outputs`), 1, 1] shape (but in {})! Please adjust " + "your Conv2D stack such that the last 2 dims are both " + "1.".format(self.model_config["conv_filters"], + self.num_outputs, list(conv_out.shape))) + logits = conv_out.squeeze(3) + logits = logits.squeeze(2) + return logits, state else: return conv_out, state diff --git a/rllib/offline/__init__.py b/rllib/offline/__init__.py index 540151cc2d4d..69b07c657006 100644 --- a/rllib/offline/__init__.py +++ b/rllib/offline/__init__.py @@ -5,7 +5,6 @@ from ray.rllib.offline.input_reader import InputReader from ray.rllib.offline.mixed_input import MixedInput from ray.rllib.offline.shuffled_input import ShuffledInput -from ray.rllib.offline.d4rl_reader import D4RLReader __all__ = [ "IOContext", @@ -16,5 +15,4 @@ "InputReader", "MixedInput", "ShuffledInput", - "D4RLReader", ] diff --git a/rllib/offline/d4rl_reader.py b/rllib/offline/d4rl_reader.py deleted file mode 100644 index 2c02af08868c..000000000000 --- a/rllib/offline/d4rl_reader.py +++ /dev/null @@ -1,52 +0,0 @@ -import logging -import gym - -from ray.rllib.offline.input_reader import InputReader -from ray.rllib.offline.io_context import IOContext -from ray.rllib.policy.sample_batch import SampleBatch -from 
ray.rllib.utils.annotations import override, PublicAPI -from ray.rllib.utils.typing import SampleBatchType -from typing import Dict - -logger = logging.getLogger(__name__) - - -@PublicAPI -class D4RLReader(InputReader): - """Reader object that loads the dataset from the D4RL dataset.""" - - @PublicAPI - def __init__(self, inputs: str, ioctx: IOContext = None): - """Initialize a D4RLReader. - - Args: - inputs (str): String corresponding to D4RL environment name - ioctx (IOContext): Current IO context object. - """ - import d4rl - self.env = gym.make(inputs) - self.dataset = convert_to_batch(d4rl.qlearning_dataset(self.env)) - assert self.dataset.count >= 1 - self.dataset.shuffle() - self.counter = 0 - - @override(InputReader) - def next(self) -> SampleBatchType: - if self.counter >= self.dataset.count: - self.counter = 0 - self.dataset.shuffle() - - self.counter += 1 - return self.dataset.slice(start=self.counter, end=self.counter + 1) - - -def convert_to_batch(dataset: Dict) -> SampleBatchType: - # Converts D4RL dataset to SampleBatch - d = {} - d[SampleBatch.OBS] = dataset["observations"] - d[SampleBatch.ACTIONS] = dataset["actions"] - d[SampleBatch.NEXT_OBS] = dataset["next_observations"] - d[SampleBatch.REWARDS] = dataset["rewards"] - d[SampleBatch.DONES] = dataset["terminals"] - - return SampleBatch(d) diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index e56691370eb1..10ecf99311e6 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -580,26 +580,22 @@ def fake_array(tensor): # Add those needed for postprocessing and training. all_accessed_keys = train_batch.accessed_keys | \ batch_for_postproc.accessed_keys - # Tag those only needed for post-processing (with some exceptions). + # Tag those only needed for post-processing. 
for key in batch_for_postproc.accessed_keys: if key not in train_batch.accessed_keys and \ - key not in self.model.view_requirements and \ - key not in [ - SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, - SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS]: + key not in self.model.view_requirements: if key in self.view_requirements: self.view_requirements[key].used_for_training = False if key in self._loss_input_dict: del self._loss_input_dict[key] # Remove those not needed at all (leave those that are needed # by Sampler to properly execute sample collection). - # Also always leave DONES, REWARDS, and INFOS, no matter what. + # Also always leave DONES and REWARDS, no matter what. for key in list(self.view_requirements.keys()): if key not in all_accessed_keys and key not in [ SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS] and \ + SampleBatch.REWARDS] and \ key not in self.model.view_requirements: # If user deleted this key manually in postprocessing # fn, warn about it and do not remove from diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index 050e655ca6ff..805cacaaa4dc 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -5,7 +5,6 @@ import functools import logging import threading -from typing import Dict, List, Optional, Tuple from ray.util.debug import log_once from ray.rllib.models.catalog import ModelCatalog @@ -19,7 +18,6 @@ from ray.rllib.utils.tf_ops import convert_to_non_tf_type from ray.rllib.utils.threading import with_lock from ray.rllib.utils.tracking_dict import UsageTrackingDict -from ray.rllib.utils.typing import TensorType tf1, tf, tfv = try_import_tf() logger = logging.getLogger(__name__) @@ -320,11 +318,8 @@ def postprocess_trajectory(self, @override(Policy) def learn_on_batch(self, postprocessed_batch): # Callback handling. 
- learn_stats = {} self.callbacks.on_learn_on_batch( - policy=self, - train_batch=postprocessed_batch, - result=learn_stats) + policy=self, train_batch=postprocessed_batch) pad_batch_to_sequences_of_same_size( postprocessed_batch, @@ -336,9 +331,7 @@ def learn_on_batch(self, postprocessed_batch): self._is_training = True postprocessed_batch["is_training"] = True - stats = self._learn_on_batch_eager(postprocessed_batch) - stats.update({"custom_metrics": learn_stats}) - return stats + return self._learn_on_batch_eager(postprocessed_batch) @convert_eager_inputs @convert_eager_outputs @@ -368,7 +361,10 @@ def _compute_gradients_eager(self, samples): grads = [g for g, v in grads_and_vars] return grads, stats + @with_lock @override(Policy) + @convert_eager_inputs + @convert_eager_outputs def compute_actions(self, obs_batch, state_batches=None, @@ -380,9 +376,16 @@ def compute_actions(self, timestep=None, **kwargs): + explore = explore if explore is not None else \ + self.config["explore"] + timestep = timestep if timestep is not None else \ + self.global_timestep + + # TODO: remove python side effect to cull sources of bugs. 
self._is_training = False self._is_recurrent = \ state_batches is not None and state_batches != [] + self._state_in = state_batches or [] if not tf1.executing_eagerly(): tf1.enable_eager_execution() @@ -391,6 +394,8 @@ def compute_actions(self, SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_batch), "is_training": tf.constant(False), } + batch_size = input_dict[SampleBatch.CUR_OBS].shape[0] + seq_lens = tf.ones(batch_size, dtype=tf.int32) if obs_include_prev_action_reward: if prev_action_batch is not None: input_dict[SampleBatch.PREV_ACTIONS] = \ @@ -399,50 +404,6 @@ def compute_actions(self, input_dict[SampleBatch.PREV_REWARDS] = \ tf.convert_to_tensor(prev_reward_batch) - return self._compute_action_helper(input_dict, state_batches, - episodes, explore, timestep) - - @override(Policy) - def compute_actions_from_input_dict( - self, - input_dict: Dict[str, TensorType], - explore: bool = None, - timestep: Optional[int] = None, - **kwargs - ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: - - if not tf1.executing_eagerly(): - tf1.enable_eager_execution() - - # Pass lazy (torch) tensor dict to Model as `input_dict`. - input_dict = self._lazy_tensor_dict(input_dict) - # Pack internal state inputs into (separate) list. - state_batches = [ - input_dict[k] for k in input_dict.keys() if "state_in" in k[:8] - ] - - return self._compute_action_helper(input_dict, state_batches, None, - explore, timestep) - - @with_lock - @convert_eager_inputs - @convert_eager_outputs - def _compute_action_helper(self, input_dict, state_batches, episodes, - explore, timestep): - - explore = explore if explore is not None else \ - self.config["explore"] - timestep = timestep if timestep is not None else \ - self.global_timestep - if isinstance(timestep, tf.Tensor): - timestep = int(timestep.numpy()) - self._is_training = False - self._state_in = state_batches or [] - # Calculate RNN sequence lengths. 
- batch_size = input_dict[SampleBatch.CUR_OBS].shape[0] - seq_lens = tf.ones(batch_size, dtype=tf.int32) if state_batches \ - else None - # Use Exploration object. with tf.variable_creator_scope(_disallow_var_creation): if action_sampler_fn: @@ -535,6 +496,8 @@ def compute_log_likelihoods(self, input_dict[SampleBatch.CUR_OBS], explore=False, is_training=False) + action_dist = dist_class(dist_inputs, self.model) + log_likelihoods = action_dist.logp(actions) # Default log-likelihood calculation. else: dist_inputs, _ = self.model(input_dict, state_batches, diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 277ec5c24b3c..577ac3d68c75 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -668,25 +668,20 @@ def _initialize_loss_from_dummy_batch( if key not in self.view_requirements: self.view_requirements[key] = ViewRequirement() if self._loss: - # Tag those only needed for post-processing (with some - # exceptions). + # Tag those only needed for post-processing. for key in batch_for_postproc.accessed_keys: if key not in train_batch.accessed_keys and \ key in self.view_requirements and \ - key not in self.model.view_requirements and \ - key not in [ - SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, - SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS]: + key not in self.model.view_requirements: self.view_requirements[key].used_for_training = False # Remove those not needed at all (leave those that are needed # by Sampler to properly execute sample collection). - # Also always leave DONES, REWARDS, INFOS, no matter what. + # Also always leave DONES and REWARDS, no matter what. 
for key in list(self.view_requirements.keys()): if key not in all_accessed_keys and key not in [ SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, SampleBatch.UNROLL_ID, SampleBatch.DONES, - SampleBatch.REWARDS, SampleBatch.INFOS] and \ + SampleBatch.REWARDS] and \ key not in self.model.view_requirements: # If user deleted this key manually in postprocessing # fn, warn about it and do not remove from @@ -714,8 +709,7 @@ def _get_dummy_batch_from_view_requirements( ret = {} for view_col, view_req in self.view_requirements.items(): if isinstance(view_req.space, (gym.spaces.Dict, gym.spaces.Tuple)): - _, shape = ModelCatalog.get_action_shape( - view_req.space, framework=self.config["framework"]) + _, shape = ModelCatalog.get_action_shape(view_req.space) ret[view_col] = \ np.zeros((batch_size, ) + shape[1:], np.float32) else: diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index 77c52d44b5d8..b64eabd47cea 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -177,8 +177,8 @@ def logp_func(means, log_stds, values, low=-1.0, high=1.0): config, prev_a, continuous=True, - layer_key=("fc", (0, 2), ("action_model._hidden_layers.0.", - "action_model._logits.")), + layer_key=("sequential/action", (2, 4), + ("action_model.action_0.", "action_model.action_out.")), logp_func=logp_func) def test_sac_discr(self): @@ -188,7 +188,12 @@ def test_sac_discr(self): config["policy_model"]["fcnet_activation"] = "linear" prev_a = np.array(0) - do_test_log_likelihood(sac.SACTrainer, config, prev_a) + do_test_log_likelihood( + sac.SACTrainer, + config, + prev_a, + layer_key=("sequential/action", (0, 2), + ("action_model.action_0.", "action_model.action_out."))) if __name__ == "__main__": diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index e71cd2b44971..3ac64441575d 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ 
-423,18 +423,9 @@ def compute_log_likelihoods( def learn_on_batch( self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]: assert self.loss_initialized() - builder = TFRunBuilder(self._sess, "learn_on_batch") - - # Callback handling. - learn_stats = {} - self.callbacks.on_learn_on_batch( - policy=self, train_batch=postprocessed_batch, result=learn_stats) - fetches = self._build_learn_on_batch(builder, postprocessed_batch) - stats = builder.get(fetches) - stats.update({"custom_metrics": learn_stats}) - return stats + return builder.get(fetches) @override(Policy) @DeveloperAPI @@ -709,14 +700,9 @@ def _build_signature_def(self): input_signature["prev_reward"] = \ tf1.saved_model.utils.build_tensor_info( self._prev_reward_input) - input_signature["is_training"] = \ tf1.saved_model.utils.build_tensor_info(self._is_training) - if self._timestep is not None: - input_signature["timestep"] = \ - tf1.saved_model.utils.build_tensor_info(self._timestep) - for state_input in self._state_inputs: input_signature[state_input.name] = \ tf1.saved_model.utils.build_tensor_info(state_input) @@ -855,6 +841,10 @@ def _build_apply_gradients(self, builder, gradients): def _build_learn_on_batch(self, builder, postprocessed_batch): self._debug_vars() + # Callback handling. 
+ self.callbacks.on_learn_on_batch( + policy=self, train_batch=postprocessed_batch) + builder.add_feed_dict(self.extra_compute_grad_feed_dict()) builder.add_feed_dict( self._get_loss_inputs_dict(postprocessed_batch, shuffle=False)) diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 7ff26dfda601..19d576d3776a 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -159,6 +159,9 @@ def compute_actions( **kwargs) -> \ Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + explore = explore if explore is not None else self.config["explore"] + timestep = timestep if timestep is not None else self.global_timestep + with torch.no_grad(): seq_lens = torch.ones(len(obs_batch), dtype=torch.int32) input_dict = self._lazy_tensor_dict({ @@ -187,6 +190,9 @@ def compute_actions_from_input_dict( **kwargs) -> \ Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + explore = explore if explore is not None else self.config["explore"] + timestep = timestep if timestep is not None else self.global_timestep + with torch.no_grad(): # Pass lazy (torch) tensor dict to Model as `input_dict`. input_dict = self._lazy_tensor_dict(input_dict) @@ -210,8 +216,6 @@ def _compute_action_helper(self, input_dict, state_batches, seq_lens, Tuple: - actions, state_out, extra_fetches, logp. """ - explore = explore if explore is not None else self.config["explore"] - timestep = timestep if timestep is not None else self.global_timestep self._is_recurrent = state_batches is not None and state_batches != [] # Switch to eval mode. @@ -347,9 +351,8 @@ def learn_on_batch( if self.model: self.model.train() # Callback handling. - learn_stats = {} self.callbacks.on_learn_on_batch( - policy=self, train_batch=postprocessed_batch, result=learn_stats) + policy=self, train_batch=postprocessed_batch) # Compute gradients (will calculate all losses and `backward()` # them to get the grads). 
@@ -361,7 +364,6 @@ def learn_on_batch( if self.model: fetches["model"] = self.model.metrics() - fetches.update({"custom_metrics": learn_stats}) return fetches diff --git a/rllib/rollout.py b/rllib/rollout.py index be4bce95a58e..dfc599160865 100755 --- a/rllib/rollout.py +++ b/rllib/rollout.py @@ -12,27 +12,24 @@ import ray import ray.cloudpickle as cloudpickle -from ray.rllib.agents.registry import get_trainer_class from ray.rllib.env import MultiAgentEnv from ray.rllib.env.base_env import _DUMMY_AGENT_ID from ray.rllib.env.env_context import EnvContext from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.spaces.space_utils import flatten_to_single_ndarray from ray.tune.utils import merge_dicts from ray.tune.registry import get_trainable_cls, _global_registry, ENV_CREATOR EXAMPLE_USAGE = """ -Example usage via RLlib CLI: +Example Usage via RLlib CLI: rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN --env CartPole-v0 --steps 1000000 --out rollouts.pkl -Example usage via executable: +Example Usage via executable: ./rollout.py /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN --env CartPole-v0 --steps 1000000 --out rollouts.pkl - -Example usage w/o checkpoint (for testing purposes): - ./rollout.py --run PPO --env CartPole-v0 --episodes 500 """ # Note: if you use any custom models or envs, register them here first, e.g.: @@ -45,94 +42,6 @@ # register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10)) -def create_parser(parser_creator=None): - parser_creator = parser_creator or argparse.ArgumentParser - parser = parser_creator( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="Roll out a reinforcement learning agent " - "given a checkpoint.", - epilog=EXAMPLE_USAGE) - - parser.add_argument( - "checkpoint", - type=str, - nargs="?", - help="(Optional) checkpoint from which to roll out. 
" - "If none given, will use an initial (untrained) Trainer.") - - required_named = parser.add_argument_group("required named arguments") - required_named.add_argument( - "--run", - type=str, - required=True, - help="The algorithm or model to train. This may refer to the name " - "of a built-on algorithm (e.g. RLLib's `DQN` or `PPO`), or a " - "user-defined trainable function or class registered in the " - "tune registry.") - required_named.add_argument( - "--env", - type=str, - help="The environment specifier to use. This could be an openAI gym " - "specifier (e.g. `CartPole-v0`) or a full class-path (e.g. " - "`ray.rllib.examples.env.simple_corridor.SimpleCorridor`).") - parser.add_argument( - "--local-mode", - action="store_true", - help="Run ray in local mode for easier debugging.") - parser.add_argument( - "--no-render", - default=False, - action="store_const", - const=True, - help="Suppress rendering of the environment.") - parser.add_argument( - "--video-dir", - type=str, - default=None, - help="Specifies the directory into which videos of all episode " - "rollouts will be stored.") - parser.add_argument( - "--steps", - default=10000, - help="Number of timesteps to roll out. Rollout will also stop if " - "`--episodes` limit is reached first. A value of 0 means no " - "limitation on the number of timesteps run.") - parser.add_argument( - "--episodes", - default=0, - help="Number of complete episodes to roll out. Rollout will also stop " - "if `--steps` (timesteps) limit is reached first. A value of 0 means " - "no limitation on the number of episodes run.") - parser.add_argument("--out", default=None, help="Output filename.") - parser.add_argument( - "--config", - default="{}", - type=json.loads, - help="Algorithm-specific configuration (e.g. env, hyperparams). 
" - "Gets merged with loaded configuration from checkpoint file and " - "`evaluation_config` settings therein.") - parser.add_argument( - "--save-info", - default=False, - action="store_true", - help="Save the info field generated by the step() method, " - "as well as the action, observations, rewards and done fields.") - parser.add_argument( - "--use-shelve", - default=False, - action="store_true", - help="Save rollouts into a python shelf file (will save each episode " - "as it is generated). An output filename must be set using --out.") - parser.add_argument( - "--track-progress", - default=False, - action="store_true", - help="Write progress to a temporary file (updated " - "after each episode). An output filename must be set using --out; " - "the progress file will live in the same folder.") - return parser - - class RolloutSaver: """Utility class for storing rollouts. @@ -256,31 +165,108 @@ def append_step(self, obs, action, next_obs, reward, done, info): self._total_steps += 1 +def create_parser(parser_creator=None): + parser_creator = parser_creator or argparse.ArgumentParser + parser = parser_creator( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Roll out a reinforcement learning agent " + "given a checkpoint.", + epilog=EXAMPLE_USAGE) + + parser.add_argument( + "checkpoint", type=str, help="Checkpoint from which to roll out.") + required_named = parser.add_argument_group("required named arguments") + required_named.add_argument( + "--run", + type=str, + required=True, + help="The algorithm or model to train. This may refer to the name " + "of a built-on algorithm (e.g. 
RLLib's DQN or PPO), or a " + "user-defined trainable function or class registered in the " + "tune registry.") + required_named.add_argument( + "--env", type=str, help="The gym environment to use.") + parser.add_argument( + "--no-render", + default=False, + action="store_const", + const=True, + help="Suppress rendering of the environment.") + parser.add_argument( + "--monitor", + default=False, + action="store_true", + help="Wrap environment in gym Monitor to record video. NOTE: This " + "option is deprecated: Use `--video-dir [some dir]` instead.") + parser.add_argument( + "--video-dir", + type=str, + default=None, + help="Specifies the directory into which videos of all episode " + "rollouts will be stored.") + parser.add_argument( + "--steps", + default=10000, + help="Number of timesteps to roll out (overwritten by --episodes).") + parser.add_argument( + "--episodes", + default=0, + help="Number of complete episodes to roll out (overrides --steps).") + parser.add_argument("--out", default=None, help="Output filename.") + parser.add_argument( + "--config", + default="{}", + type=json.loads, + help="Algorithm-specific configuration (e.g. env, hyperparams). " + "Gets merged with loaded configuration from checkpoint file and " + "`evaluation_config` settings therein.") + parser.add_argument( + "--save-info", + default=False, + action="store_true", + help="Save the info field generated by the step() method, " + "as well as the action, observations, rewards and done fields.") + parser.add_argument( + "--use-shelve", + default=False, + action="store_true", + help="Save rollouts into a python shelf file (will save each episode " + "as it is generated). An output filename must be set using --out.") + parser.add_argument( + "--track-progress", + default=False, + action="store_true", + help="Write progress to a temporary file (updated " + "after each episode). 
An output filename must be set using --out; " + "the progress file will live in the same folder.") + return parser + + def run(args, parser): # Load configuration from checkpoint file. - config_path = "" - if args.checkpoint: - config_dir = os.path.dirname(args.checkpoint) - config_path = os.path.join(config_dir, "params.pkl") - # Try parent directory. - if not os.path.exists(config_path): - config_path = os.path.join(config_dir, "../params.pkl") + config_dir = os.path.dirname(args.checkpoint) + config_path = os.path.join(config_dir, "params.pkl") + # Try parent directory. + if not os.path.exists(config_path): + config_path = os.path.join(config_dir, "../params.pkl") - # Load the config from pickled. - if os.path.exists(config_path): - with open(config_path, "rb") as f: - config = cloudpickle.load(f) # If no pkl file found, require command line `--config`. - else: - # If no config in given checkpoint -> Error. - if args.checkpoint: + if not os.path.exists(config_path): + if not args.config: raise ValueError( "Could not find params.pkl in either the checkpoint dir or " - "its parent directory AND no `--config` given on command " - "line!") + "its parent directory AND no config given on command line!") + else: + config = args.config - # Use default config for given agent. - _, config = get_trainer_class(args.run, return_config=True) + # Load the config from pickled. + else: + with open(config_path, "rb") as f: + config = cloudpickle.load(f) + + # Set num_workers to be at least 2. + if "num_workers" in config: + config["num_workers"] = min(2, config["num_workers"]) # Make sure worker 0 has an Env. config["create_env_on_driver"] = True @@ -299,31 +285,25 @@ def run(args, parser): parser.error("the following arguments are required: --env") args.env = config.get("env") - # Make sure we have evaluation workers. 
- if not config.get("evaluation_num_workers"): - config["evaluation_num_workers"] = config.get("num_workers", 0) - if not config.get("evaluation_num_episodes"): - config["evaluation_num_episodes"] = 1 - config["render_env"] = not args.no_render - config["record_env"] = args.video_dir - - ray.init(local_mode=args.local_mode) + ray.init() # Create the Trainer from config. cls = get_trainable_cls(args.run) agent = cls(env=args.env, config=config) - - # Load state from checkpoint, if provided. - if args.checkpoint: - agent.restore(args.checkpoint) - + # Load state from checkpoint. + agent.restore(args.checkpoint) num_steps = int(args.steps) num_episodes = int(args.episodes) # Determine the video output directory. + # Deprecated way: Use (--out|~/ray_results) + "/monitor" as dir. video_dir = None - # Allow user to specify a video output path. - if args.video_dir: + if args.monitor: + video_dir = os.path.join( + os.path.dirname(args.out or "") + or os.path.expanduser("~/ray_results/"), "monitor") + # New way: Allow user to specify a video output path. + elif args.video_dir: video_dir = os.path.expanduser(args.video_dir) # Do the actual rollout. @@ -353,13 +333,13 @@ def default_policy_agent_mapping(unused_agent_id): def keep_going(steps, num_steps, episodes, num_episodes): """Determine whether we've collected enough data""" - # If num_episodes is set, stop if limit reached. - if num_episodes and episodes >= num_episodes: - return False - # If num_steps is set, stop if limit reached. - elif num_steps and steps >= num_steps: - return False - # Otherwise, keep going. 
+ # if num_episodes is set, this overrides num_steps + if num_episodes: + return episodes < num_episodes + # if num_steps is set, continue until we reach the limit + if num_steps: + return steps < num_steps + # otherwise keep going forever return True @@ -375,36 +355,16 @@ def rollout(agent, if saver is None: saver = RolloutSaver() - # Normal case: Agent was setup correctly with an evaluation WorkerSet, - # which we will now use to rollout. - if hasattr(agent, "evaluation_workers") and isinstance( - agent.evaluation_workers, WorkerSet): - steps = 0 - episodes = 0 - while keep_going(steps, num_steps, episodes, num_episodes): - saver.begin_rollout() - eval_result = agent._evaluate()["evaluation"] - # Increase timestep and episode counters. - eps = agent.config["evaluation_num_episodes"] - episodes += eps - steps += eps * eval_result["episode_len_mean"] - # Print out results and continue. - print("Episode #{}: reward: {}".format( - episodes, eval_result["episode_reward_mean"])) - saver.end_rollout() - return - - # Agent has no evaluation workers, but RolloutWorkers. - elif hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet): + if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] + policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} - # Agent has neither evaluation- nor rollout workers. 
else: from gym import envs if envs.registry.env_specs.get(agent.config["env"]): @@ -437,7 +397,7 @@ def rollout(agent, env = gym_wrappers.Monitor( env=env, directory=video_dir, - video_callable=lambda _: True, + video_callable=lambda x: True, force=True) steps = 0 @@ -510,6 +470,15 @@ def rollout(agent, parser = create_parser() args = parser.parse_args() + # Old option: monitor, use video-dir instead. + if args.monitor: + deprecation_warning("--monitor", "--video-dir=[some dir]") + # User tries to record videos, but no-render is set: Error. + if (args.monitor or args.video_dir) and args.no_render: + raise ValueError( + "You have --no-render set, but are trying to record rollout videos" + " (via options --video-dir/--monitor)! " + "Either unset --no-render or do not use --video-dir/--monitor.") # --use_shelve w/o --out option. if args.use_shelve and not args.out: raise ValueError( diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index cc2650425fb9..3f42147e4071 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -37,10 +37,6 @@ "--yaml-dir", type=str, help="The directory in which to find all yamls to test.") -parser.add_argument( - "--local-mode", - action="store_true", - help="Run ray in local mode for easier debugging.") # Obsoleted arg, use --framework=torch instead. 
parser.add_argument( @@ -96,7 +92,7 @@ passed = False for i in range(3): try: - ray.init(num_cpus=5, local_mode=args.local_mode) + ray.init(num_cpus=5) trials = run_experiments(experiments, resume=False, verbose=2) finally: ray.shutdown() diff --git a/rllib/tests/test_catalog.py b/rllib/tests/test_catalog.py index bbd1ec1bbbaa..b98f7143a56d 100644 --- a/rllib/tests/test_catalog.py +++ b/rllib/tests/test_catalog.py @@ -1,15 +1,13 @@ -from functools import partial import gym -from gym.spaces import Box, Dict, Discrete +from gym.spaces import Box, Discrete import numpy as np import unittest import ray -from ray.rllib.models import ActionDistribution, ModelCatalog, MODEL_DEFAULTS -from ray.rllib.models.preprocessors import NoPreprocessor, Preprocessor -from ray.rllib.models.tf.tf_action_dist import MultiActionDistribution, \ - TFActionDistribution +from ray.rllib.models import ModelCatalog, MODEL_DEFAULTS, ActionDistribution from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.models.preprocessors import NoPreprocessor, Preprocessor from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.test_utils import framework_iterator @@ -62,12 +60,6 @@ def logp(self, x): return tf.zeros(self.output_shape) -class CustomMultiActionDistribution(MultiActionDistribution): - @override(MultiActionDistribution) - def entropy(self): - raise NotImplementedError - - class TestModelCatalog(unittest.TestCase): def tearDown(self): ray.shutdown() @@ -169,42 +161,6 @@ class Model(): with self.assertRaises(NotImplementedError): dist.entropy() - def test_custom_multi_action_distribution(self): - class Model(): - pass - - ray.init( - object_store_memory=1000 * 1024 * 1024, - ignore_reinit_error=True) # otherwise fails sometimes locally - # registration - ModelCatalog.register_custom_action_dist( - "test", 
CustomMultiActionDistribution) - s1 = Discrete(5) - s2 = Box(0, 1, shape=(3, ), dtype=np.float32) - spaces = dict(action_1=s1, action_2=s2) - action_space = Dict(spaces) - # test retrieving it - model_config = MODEL_DEFAULTS.copy() - model_config["custom_action_dist"] = "test" - dist_cls, param_shape = ModelCatalog.get_action_dist( - action_space, model_config) - self.assertIsInstance(dist_cls, partial) - self.assertEqual(param_shape, s1.n + 2 * s2.shape[0]) - - # test the class works as a distribution - dist_input = tf1.placeholder(tf.float32, (None, param_shape)) - model = Model() - model.model_config = model_config - dist = dist_cls(dist_input, model=model) - self.assertIsInstance(dist.sample(), dict) - self.assertIn("action_1", dist.sample()) - self.assertIn("action_2", dist.sample()) - self.assertEqual(dist.sample()["action_1"].dtype, tf.int64) - self.assertEqual(dist.sample()["action_2"].shape[1:], s2.shape) - - with self.assertRaises(NotImplementedError): - dist.entropy() - if __name__ == "__main__": import pytest diff --git a/rllib/tests/test_checkpoint_restore.py b/rllib/tests/test_checkpoint_restore.py index b95a50015273..42bc039d8423 100644 --- a/rllib/tests/test_checkpoint_restore.py +++ b/rllib/tests/test_checkpoint_restore.py @@ -4,7 +4,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.test_utils import check, framework_iterator @@ -69,7 +69,7 @@ def ckpt_restore_test(alg_name, tfe=False): for fw in framework_iterator(config, frameworks=frameworks): for use_object_store in [False, True]: print("use_object_store={}".format(use_object_store)) - cls = get_trainer_class(alg_name) + cls = get_agent_class(alg_name) if "DDPG" in alg_name or "SAC" in alg_name: alg1 = cls(config=config, env="Pendulum-v0") alg2 = cls(config=config, env="Pendulum-v0") diff --git a/rllib/tests/test_eager_support.py b/rllib/tests/test_eager_support.py index 
b08918e04c28..95e6c69fc9e6 100644 --- a/rllib/tests/test_eager_support.py +++ b/rllib/tests/test_eager_support.py @@ -2,7 +2,7 @@ import ray from ray import tune -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.framework import try_import_tf tf1, tf, tfv = try_import_tf() @@ -23,7 +23,7 @@ def check_support(alg, config, test_eager=False, test_trace=True): else: config["env"] = "CartPole-v0" - a = get_trainer_class(alg) + a = get_agent_class(alg) if test_eager: print("tf-eager: alg={} cont.act={}".format(alg, cont)) config["eager_tracing"] = False diff --git a/rllib/tests/test_export.py b/rllib/tests/test_export.py index bb8bde8e15e6..f2f61b00545f 100644 --- a/rllib/tests/test_export.py +++ b/rllib/tests/test_export.py @@ -5,12 +5,9 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class -from ray.rllib.utils.framework import try_import_tf +from ray.rllib.agents.registry import get_agent_class from ray.tune.trial import ExportFormat -tf1, tf, tfv = try_import_tf() - CONFIGS = { "A3C": { "explore": False, @@ -77,7 +74,7 @@ def valid_tf_checkpoint(checkpoint_dir): and os.path.exists(os.path.join(checkpoint_dir, "model.index")) \ and os.path.exists(os.path.join(checkpoint_dir, "checkpoint")) - cls = get_trainer_class(alg_name) + cls = get_agent_class(alg_name) if "DDPG" in alg_name or "SAC" in alg_name: algo = cls(config=CONFIGS[alg_name], env="Pendulum-v0") else: @@ -108,11 +105,6 @@ def valid_tf_checkpoint(checkpoint_dir): or not valid_tf_checkpoint(os.path.join(export_dir, ExportFormat.CHECKPOINT)): failures.append(alg_name) - - # Test loading the exported model. 
- model = tf.saved_model.load(os.path.join(export_dir, ExportFormat.MODEL)) - assert model - shutil.rmtree(export_dir) diff --git a/rllib/tests/test_ignore_worker_failure.py b/rllib/tests/test_ignore_worker_failure.py index a49d068f4ec0..8cb9962ce8a0 100644 --- a/rllib/tests/test_ignore_worker_failure.py +++ b/rllib/tests/test_ignore_worker_failure.py @@ -3,7 +3,7 @@ import ray from ray.rllib import _register_all -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.utils.test_utils import framework_iterator from ray.tune.registry import register_env @@ -37,7 +37,7 @@ def do_test(self, alg, config, fn=None): def _do_test_fault_recover(self, alg, config): register_env("fault_env", lambda c: FaultInjectEnv(c)) - agent_cls = get_trainer_class(alg) + agent_cls = get_agent_class(alg) # Test fault handling config["num_workers"] = 2 @@ -51,7 +51,7 @@ def _do_test_fault_recover(self, alg, config): def _do_test_fault_fatal(self, alg, config): register_env("fault_env", lambda c: FaultInjectEnv(c)) - agent_cls = get_trainer_class(alg) + agent_cls = get_agent_class(alg) # Test raises real error when out of workers config["num_workers"] = 2 config["ignore_worker_failures"] = True diff --git a/rllib/tests/test_model_imports.py b/rllib/tests/test_model_imports.py index d4d1c8545311..2a03b3789ff3 100644 --- a/rllib/tests/test_model_imports.py +++ b/rllib/tests/test_model_imports.py @@ -6,7 +6,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.tf.misc import normc_initializer from ray.rllib.models.tf.tf_modelv2 import TFModelV2 @@ -127,7 +127,7 @@ def model_import_test(algo, config, env): rllib_dir = Path(__file__).parent.parent import_file = str(rllib_dir) + "/tests/data/model_weights/weights.h5" - agent_cls = get_trainer_class(algo) + agent_cls = 
get_agent_class(algo) for fw in framework_iterator(config, ["tf", "torch"]): config["model"]["custom_model"] = "keras_model" if fw != "torch" else \ diff --git a/rllib/tests/test_nested_observation_spaces.py b/rllib/tests/test_nested_observation_spaces.py index e1aac7b42cb3..1a10e8c71d0e 100644 --- a/rllib/tests/test_nested_observation_spaces.py +++ b/rllib/tests/test_nested_observation_spaces.py @@ -333,7 +333,7 @@ def test_invalid_model(self): def test_invalid_model2(self): ModelCatalog.register_custom_model("invalid2", InvalidModel2) self.assertRaisesRegexp( - ValueError, "State output is not a list", + ValueError, "Expected output shape of", lambda: PGTrainer( env="CartPole-v0", config={ "model": { diff --git a/rllib/tests/test_pettingzoo_env.py b/rllib/tests/test_pettingzoo_env.py index d56d82c53d07..bf3fc4aaa4cd 100644 --- a/rllib/tests/test_pettingzoo_env.py +++ b/rllib/tests/test_pettingzoo_env.py @@ -4,7 +4,7 @@ import ray from ray.tune.registry import register_env from ray.rllib.env import PettingZooEnv -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from pettingzoo.mpe import simple_spread_v2 @@ -20,7 +20,7 @@ def test_pettingzoo_env(self): register_env("simple_spread", lambda _: PettingZooEnv(simple_spread_v2.env())) - agent_class = get_trainer_class("PPO") + agent_class = get_agent_class("PPO") config = deepcopy(agent_class._default_config) diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index 0f4063bb2e88..7e7eecc41b60 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -1,7 +1,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.examples.env.multi_agent import MultiAgentCartPole, \ MultiAgentMountainCar from ray.rllib.utils.test_utils import framework_iterator @@ -19,11 +19,10 @@ 
def check_support_multiagent(alg, config): alg in ["A3C", "APEX", "APEX_DDPG", "IMPALA"]: continue if alg in ["DDPG", "APEX_DDPG", "SAC"]: - a = get_trainer_class(alg)( + a = get_agent_class(alg)( config=config, env="multi_agent_mountaincar") else: - a = get_trainer_class(alg)( - config=config, env="multi_agent_cartpole") + a = get_agent_class(alg)(config=config, env="multi_agent_cartpole") print(a.train()) a.stop() @@ -66,7 +65,7 @@ def test_ppo_multiagent(self): class TestSupportedMultiAgentOffPolicy(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - ray.init(num_cpus=6) + ray.init(num_cpus=4) @classmethod def tearDownClass(cls) -> None: @@ -82,9 +81,6 @@ def test_apex_multiagent(self): "min_iter_time_s": 1, "learning_starts": 10, "target_network_update_freq": 100, - "optimizer": { - "num_replay_buffer_shards": 1, - }, }) def test_apex_ddpg_multiagent(self): diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 9da6249273c9..39a7ebb9382f 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -3,7 +3,7 @@ import unittest import ray -from ray.rllib.agents.registry import get_trainer_class +from ray.rllib.agents.registry import get_agent_class from ray.rllib.examples.env.random_env import RandomEnv from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2 from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2 @@ -15,7 +15,7 @@ ACTION_SPACES_TO_TEST = { "discrete": Discrete(5), "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32), - "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32), + # "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32), "multidiscrete": MultiDiscrete([1, 2, 3, 4]), "tuple": Tuple( [Discrete(2), @@ -47,8 +47,6 @@ def check_support(alg, config, train=True, check_bounds=False, tfe=False): config["log_level"] = "ERROR" - config["train_batch_size"] = 10 - config["rollout_fragment_length"] = 10 def _do_check(alg, config, a_name, 
o_name): fw = config["framework"] @@ -65,9 +63,11 @@ def _do_check(alg, config, a_name, o_name): p_done=1.0, check_action_bounds=check_bounds))) stat = "ok" + if alg == "SAC": + config["use_state_preprocessor"] = o_name in ["atari", "image"] try: - a = get_trainer_class(alg)(config=config, env=RandomEnv) + a = get_agent_class(alg)(config=config, env=RandomEnv) except UnsupportedSpaceException: stat = "unsupported" else: @@ -90,24 +90,25 @@ def _do_check(alg, config, a_name, o_name): frameworks = ("tf", "torch") if tfe: - frameworks += ("tf2", "tfe") + frameworks += ("tfe", ) for _ in framework_iterator(config, frameworks=frameworks): - # Zip through action- and obs-spaces. - for a_name, o_name in zip(ACTION_SPACES_TO_TEST.keys(), - OBSERVATION_SPACES_TO_TEST.keys()): - _do_check(alg, config, a_name, o_name) - # Do the remaining obs spaces. - assert len(OBSERVATION_SPACES_TO_TEST) >= len(ACTION_SPACES_TO_TEST) - for i, o_name in enumerate(OBSERVATION_SPACES_TO_TEST.keys()): - if i < len(ACTION_SPACES_TO_TEST): + # Check all action spaces (using a discrete obs-space). + for a_name in ACTION_SPACES_TO_TEST.keys(): + _do_check(alg, config, a_name, "discrete") + # Check all obs spaces (using a supported action-space). + for o_name in OBSERVATION_SPACES_TO_TEST.keys(): + # We already tested discrete observation spaces against all action + # spaces above -> skip. 
+ if o_name == "discrete": continue - _do_check(alg, config, "discrete", o_name) + a_name = "discrete" if alg not in ["DDPG", "SAC"] else "vector" + _do_check(alg, config, a_name, o_name) class TestSupportedSpacesPG(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - ray.init(num_cpus=6) + ray.init(num_cpus=4) @classmethod def tearDownClass(cls) -> None: @@ -126,11 +127,11 @@ def test_impala(self): def test_ppo(self): config = { - "num_workers": 0, - "train_batch_size": 100, - "rollout_fragment_length": 10, + "num_workers": 1, "num_sgd_iter": 1, - "sgd_minibatch_size": 10, + "train_batch_size": 10, + "rollout_fragment_length": 10, + "sgd_minibatch_size": 1, } check_support("PPO", config, check_bounds=True, tfe=True) diff --git a/rllib/tests/test_trainer.py b/rllib/tests/test_trainer.py deleted file mode 100644 index 7555c27c5581..000000000000 --- a/rllib/tests/test_trainer.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Testing for trainer class""" -import copy -import unittest -from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG - - -class TestTrainer(unittest.TestCase): - def test_validate_config_idempotent(self): - """ - Asserts that validate_config run multiple - times on COMMON_CONFIG will be idempotent - """ - # Given - standard_config = copy.deepcopy(COMMON_CONFIG) - standard_config["_use_trajectory_view_api"] = False - - # When (we validate config 2 times) - Trainer._validate_config(standard_config) - config_v1 = copy.deepcopy(standard_config) - Trainer._validate_config(standard_config) - config_v2 = copy.deepcopy(standard_config) - - # Then - self.assertEqual(config_v1, config_v2) - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/train.py b/rllib/train.py index 8314556d045a..228dcbfbca36 100755 --- a/rllib/train.py +++ b/rllib/train.py @@ -60,7 +60,8 @@ def create_parser(parser_creator=None): parser.add_argument( "--local-mode", action="store_true", - help="Run ray in local 
mode for easier debugging.") + help="Whether to run ray with `local_mode=True`. " + "Only if --ray-num-nodes is not used.") parser.add_argument( "--ray-num-cpus", default=None, diff --git a/rllib/tuned_examples/cql/halfcheetah-cql.yaml b/rllib/tuned_examples/cql/halfcheetah-cql.yaml index 9a5fa9982875..5bab20751c53 100644 --- a/rllib/tuned_examples/cql/halfcheetah-cql.yaml +++ b/rllib/tuned_examples/cql/halfcheetah-cql.yaml @@ -5,7 +5,6 @@ halfcheetah_cql: episode_reward_mean: 9000 config: # SAC Configs - input: d4rl.halfcheetah-medium-v0 framework: torch horizon: 1000 soft_horizon: false diff --git a/rllib/tuned_examples/sac/atari-sac.yaml b/rllib/tuned_examples/sac/atari-sac.yaml index 4efca862011d..28c6d26db6a1 100644 --- a/rllib/tuned_examples/sac/atari-sac.yaml +++ b/rllib/tuned_examples/sac/atari-sac.yaml @@ -14,6 +14,8 @@ atari-sac-tf-and-torch: framework: grid_search: [tf, torch] gamma: 0.99 + # state-preprocessor=Our default Atari Conv2D-net. + use_state_preprocessor: true Q_model: hidden_activation: relu hidden_layer_sizes: [512] diff --git a/rllib/tuned_examples/sac/mspacman-sac.yaml b/rllib/tuned_examples/sac/mspacman-sac.yaml index 9d563884bf2d..50883b114ecb 100644 --- a/rllib/tuned_examples/sac/mspacman-sac.yaml +++ b/rllib/tuned_examples/sac/mspacman-sac.yaml @@ -11,6 +11,8 @@ mspacman-sac-tf: # Works for both torch and tf. framework: tf gamma: 0.99 + # state-preprocessor=Our default Atari Conv2D-net. + use_state_preprocessor: true Q_model: fcnet_hiddens: [512] fcnet_activation: relu diff --git a/rllib/utils/sgd.py b/rllib/utils/sgd.py index 787b885cd7d6..b5b72d44d37c 100644 --- a/rllib/utils/sgd.py +++ b/rllib/utils/sgd.py @@ -104,12 +104,12 @@ def do_minibatch_sgd(samples, policies, local_worker, num_sgd_iter, """Execute minibatch SGD. Args: - samples (SampleBatch): Batch of samples to optimize. - policies (dict): Dictionary of policies to optimize. - local_worker (RolloutWorker): Master rollout worker instance. 
- num_sgd_iter (int): Number of epochs of optimization to take. - sgd_minibatch_size (int): Size of minibatches to use for optimization. - standardize_fields (list): List of sample field names that should be + samples (SampleBatch): batch of samples to optimize. + policies (dict): dictionary of policies to optimize. + local_worker (RolloutWorker): master rollout worker instance. + num_sgd_iter (int): number of epochs of optimization to take. + sgd_minibatch_size (int): size of minibatches to use for optimization. + standardize_fields (list): list of sample field names that should be normalized prior to optimization. Returns: diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 89a402117b4c..eda9d1cfa11a 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -301,10 +301,13 @@ def check_compute_single_action(trainer, assert worker_set if isinstance(worker_set, list): obs_space = trainer.get_policy().observation_space + try: + obs_space = obs_space.original_space + except AttributeError: + pass else: obs_space = worker_set.local_worker().for_policy( lambda p: p.observation_space) - obs_space = getattr(obs_space, "original_space", obs_space) else: method_to_test = pol.compute_single_action obs_space = pol.observation_space diff --git a/rllib/utils/threading.py b/rllib/utils/threading.py index adc7dfe10f40..7361dad65383 100644 --- a/rllib/utils/threading.py +++ b/rllib/utils/threading.py @@ -22,6 +22,6 @@ def wrapper(self, *a, **k): except AttributeError: raise AttributeError( "Object {} must have a `self._lock` property (assigned to a " - "threading.RLock() object in its constructor)!".format(self)) + "threading.Lock() object in its constructor)!".format(self)) return wrapper diff --git a/src/ray/common/placement_group.h b/src/ray/common/placement_group.h index 532f69d74ef9..a068ce4a1e51 100644 --- a/src/ray/common/placement_group.h +++ b/src/ray/common/placement_group.h @@ -67,9 +67,8 @@ class PlacementGroupSpecBuilder { 
PlacementGroupSpecBuilder &SetPlacementGroupSpec( const PlacementGroupID &placement_group_id, std::string name, const std::vector> &bundles, - const rpc::PlacementStrategy strategy, const bool is_detached, - const JobID &creator_job_id, const ActorID &creator_actor_id, - bool is_creator_detached_actor) { + const rpc::PlacementStrategy strategy, const JobID &creator_job_id, + const ActorID &creator_actor_id, bool is_creator_detached_actor) { message_->set_placement_group_id(placement_group_id.Binary()); message_->set_name(name); message_->set_strategy(strategy); @@ -83,7 +82,6 @@ class PlacementGroupSpecBuilder { message_->set_creator_job_dead(is_creator_detached_actor); message_->set_creator_actor_id(creator_actor_id.Binary()); message_->set_creator_actor_dead(creator_actor_id.IsNil()); - message_->set_is_detached(is_detached); for (size_t i = 0; i < bundles.size(); i++) { auto resources = bundles[i]; diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 3bcb1554697c..cfbc62517d5e 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -35,7 +35,7 @@ RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000) RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000) /// The duration between heartbeats sent by the raylets. -RAY_CONFIG(int64_t, raylet_heartbeat_period_milliseconds, 100) +RAY_CONFIG(int64_t, raylet_heartbeat_timeout_milliseconds, 100) /// If a component has not sent a heartbeat in the last num_heartbeats_timeout /// heartbeat intervals, the raylet monitor process will report /// it as dead to the db_client table. @@ -57,6 +57,10 @@ RAY_CONFIG(int64_t, debug_dump_period_milliseconds, 10000) /// type of task from starving other types (see issue #3664). RAY_CONFIG(bool, fair_queueing_enabled, true) +/// Whether to enable object pinning for plasma objects. When this is +/// enabled, objects in scope in the cluster will not be LRU evicted. 
+RAY_CONFIG(bool, object_pinning_enabled, true) + /// Whether to enable distributed reference counting for objects. When this is /// enabled, an object's ref count will include any references held by other /// processes, such as when an ObjectID is serialized and passed as an argument @@ -66,9 +70,11 @@ RAY_CONFIG(bool, fair_queueing_enabled, true) /// information: /// 1. Local Python references to the ObjectID. /// 2. Pending tasks submitted by the local process that depend on the object. -/// If both this flag is turned on, then an object +/// If both this flag and object_pinning_enabled are turned on, then an object /// will not be LRU evicted until it is out of scope in ALL processes in the -/// cluster and all objects that contain it are also out of scope. +/// cluster and all objects that contain it are also out of scope. If this flag +/// is off and object_pinning_enabled is turned on, then an object will not be +/// LRU evicted until it is out of scope on the CREATOR of the ObjectID. RAY_CONFIG(bool, distributed_ref_counting_enabled, true) /// Whether to record the creation sites of object references. This adds more @@ -76,7 +82,7 @@ RAY_CONFIG(bool, distributed_ref_counting_enabled, true) /// creating object references. RAY_CONFIG(bool, record_ref_creation_sites, true) -/// Objects that have been unpinned are +/// If object_pinning_enabled is on, then objects that have been unpinned are /// added to a local cache. When the cache is flushed, all objects in the cache /// will be eagerly evicted in a batch by freeing all plasma copies in the /// cluster. If set, then this is the duration between attempts to flush the @@ -87,10 +93,10 @@ RAY_CONFIG(bool, record_ref_creation_sites, true) /// serialized, then either passed as an argument or returned from a task. /// NOTE(swang): The timer is checked by the raylet during every heartbeat, so /// this should be set to a value larger than -/// raylet_heartbeat_period_milliseconds. 
+/// raylet_heartbeat_timeout_milliseconds. RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000) -/// Objects that have been unpinned are +/// If object_pinning_enabled is on, then objects that have been unpinned are /// added to a local cache. When the cache is flushed, all objects in the cache /// will be eagerly evicted in a batch by freeing all plasma copies in the /// cluster. This is the maximum number of objects in the local cache before it @@ -355,20 +361,7 @@ RAY_CONFIG(bool, automatic_object_deletion_enabled, true) /// Grace period until we throw the OOM error to the application in seconds. RAY_CONFIG(int64_t, oom_grace_period_s, 10) -/// Whether or not the external storage is file system. -/// This is configured based on object_spilling_config. -RAY_CONFIG(bool, is_external_storage_type_fs, true) - /* Configuration parameters for locality-aware scheduling. */ /// Whether to enable locality-aware leasing. If enabled, then Ray will consider task /// dependency locality when choosing a worker for leasing. RAY_CONFIG(bool, locality_aware_leasing_enabled, true) - -/* Configuration parameters for logging */ -/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's -/// maxBytes argument. -RAY_CONFIG(int64_t, log_rotation_max_bytes, 100 * 1024 * 1024) - -/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's -/// backupCount argument. -RAY_CONFIG(int64_t, log_rotation_backup_count, 5) diff --git a/src/ray/common/ray_object.h b/src/ray/common/ray_object.h index c036550a8652..633a5d787c7e 100644 --- a/src/ray/common/ray_object.h +++ b/src/ray/common/ray_object.h @@ -92,20 +92,12 @@ class RayObject { /// large to return directly as part of a gRPC response). bool IsInPlasmaError() const; - /// Mark this object as accessed before. - void SetAccessed() { accessed_ = true; }; - - /// Check if this object was accessed before. 
- bool WasAccessed() const { return accessed_; } - private: std::shared_ptr data_; std::shared_ptr metadata_; const std::vector nested_ids_; /// Whether this class holds a data copy. bool has_data_copy_; - /// Whether this object was accessed. - bool accessed_ = false; }; } // namespace ray diff --git a/src/ray/core_worker/common.h b/src/ray/core_worker/common.h index bb10aff958ad..1716fe606de9 100644 --- a/src/ray/core_worker/common.h +++ b/src/ray/core_worker/common.h @@ -144,11 +144,8 @@ using PlacementStrategy = rpc::PlacementStrategy; struct PlacementGroupCreationOptions { PlacementGroupCreationOptions( std::string name, PlacementStrategy strategy, - std::vector> bundles, bool is_detached) - : name(std::move(name)), - strategy(strategy), - bundles(std::move(bundles)), - is_detached(is_detached) {} + std::vector> bundles) + : name(std::move(name)), strategy(strategy), bundles(std::move(bundles)) {} /// The name of the placement group. const std::string name; @@ -156,8 +153,6 @@ struct PlacementGroupCreationOptions { const PlacementStrategy strategy = rpc::PACK; /// The resource bundles in this placement group. const std::vector> bundles; - /// Whether to keep the placement group persistent after its creator dead. - const bool is_detached = false; }; } // namespace ray diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 06d12387c8ad..21fc462a7af6 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -161,21 +161,15 @@ CoreWorkerProcess::CoreWorkerProcess(const CoreWorkerOptions &options) // RayConfig is generated in Java_io_ray_runtime_RayNativeRuntime_nativeInitialize // for java worker or in constructor of CoreWorker for python worker. ray::stats::Init(global_tags, options_.metrics_agent_port); - -#ifndef _WIN32 - // NOTE(kfstorm): std::atexit should be put at the end of `CoreWorkerProcess` - // constructor. We assume that spdlog has been initialized before this line. 
When the - // process is exiting, `HandleAtExit` will be invoked before destructing spdlog static - // variables. We explicitly destruct `CoreWorkerProcess` instance in the callback to - // ensure the static `CoreWorkerProcess` instance is destructed while spdlog is still - // usable. This prevents crashing (or hanging) when using `RAY_LOG` in - // `CoreWorkerProcess` destructor. - RAY_CHECK(std::atexit(CoreWorkerProcess::HandleAtExit) == 0); -#endif } CoreWorkerProcess::~CoreWorkerProcess() { RAY_LOG(INFO) << "Destructing CoreWorkerProcess. pid: " << getpid(); + { + // Check that all `CoreWorker` instances have been removed. + absl::ReaderMutexLock lock(&worker_map_mutex_); + RAY_CHECK(workers_.empty()); + } RAY_LOG(DEBUG) << "Stats stop in core worker."; // Shutdown stats module if worker process exits. ray::stats::Shutdown(); @@ -189,8 +183,6 @@ void CoreWorkerProcess::EnsureInitialized() { << "shutdown."; } -void CoreWorkerProcess::HandleAtExit() { instance_.reset(); } - std::shared_ptr CoreWorkerProcess::TryGetWorker(const WorkerID &worker_id) { if (!instance_) { return nullptr; @@ -422,7 +414,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ return Status::OK(); }, options_.ref_counting_enabled ? 
reference_counter_ : nullptr, local_raylet_client_, - options_.check_signals, options_.unhandled_exception_handler)); + options_.check_signals)); auto check_node_alive_fn = [this](const NodeID &node_id) { auto node = gcs_client_->Nodes().Get(node_id); @@ -535,56 +527,27 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ actor_manager_ = std::unique_ptr( new ActorManager(gcs_client_, direct_actor_submitter_, reference_counter_)); - std::function - object_lookup_fn; - - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - object_lookup_fn = [this, node_addr_factory](const ObjectID &object_id, - const ObjectLookupCallback &callback) { - std::vector locations; - const absl::optional> object_locations = - reference_counter_->GetObjectLocations(object_id); - if (object_locations.has_value()) { - locations.reserve(object_locations.value().size()); - for (const auto &node_id : object_locations.value()) { - absl::optional addr = node_addr_factory(node_id); - if (addr.has_value()) { - locations.push_back(addr.value()); - } else { - // We're getting potentially stale locations directly from the reference - // counter, so the location might be a dead node. 
- RAY_LOG(DEBUG) << "Location " << node_id - << " is dead, not using it in the recovery of object " - << object_id; + auto object_lookup_fn = [this](const ObjectID &object_id, + const ObjectLookupCallback &callback) { + return gcs_client_->Objects().AsyncGetLocations( + object_id, [this, object_id, callback]( + const Status &status, + const boost::optional &result) { + RAY_CHECK_OK(status); + std::vector locations; + for (const auto &loc : result->locations()) { + const auto &node_id = NodeID::FromBinary(loc.manager()); + auto node = gcs_client_->Nodes().Get(node_id); + RAY_CHECK(node.has_value()); + rpc::Address address; + address.set_raylet_id(node->node_id()); + address.set_ip_address(node->node_manager_address()); + address.set_port(node->node_manager_port()); + locations.push_back(address); } - } - } - callback(object_id, locations); - return Status::OK(); - }; - } else { - object_lookup_fn = [this](const ObjectID &object_id, - const ObjectLookupCallback &callback) { - return gcs_client_->Objects().AsyncGetLocations( - object_id, [this, object_id, callback]( - const Status &status, - const boost::optional &result) { - RAY_CHECK_OK(status); - std::vector locations; - for (const auto &loc : result->locations()) { - const auto &node_id = NodeID::FromBinary(loc.manager()); - auto node = gcs_client_->Nodes().Get(node_id); - RAY_CHECK(node.has_value()); - rpc::Address address; - address.set_raylet_id(node->node_id()); - address.set_ip_address(node->node_manager_address()); - address.set_port(node->node_manager_port()); - locations.push_back(address); - } - callback(object_id, locations); - }); - }; - } + callback(object_id, locations); + }); + }; object_recovery_manager_ = std::unique_ptr(new ObjectRecoveryManager( rpc_address_, raylet_client_factory, local_raylet_client_, object_lookup_fn, @@ -603,8 +566,6 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ // NOTE: This also marks the worker as available in Raylet. 
We do this at the // very end in case there is a problem during construction. RAY_CHECK_OK(local_raylet_client_->AnnounceWorkerPort(core_worker_server_->GetPort())); - // Used to detect if the object is in the plasma store. - max_direct_call_object_size_ = RayConfig::instance().max_direct_call_object_size(); } void CoreWorker::Shutdown() { @@ -799,7 +760,6 @@ void CoreWorker::InternalHeartbeat(const boost::system::error_code &error) { } absl::MutexLock lock(&mutex_); - while (!to_resubmit_.empty() && current_time_ms() > to_resubmit_.front().first) { auto &spec = to_resubmit_.front().second; if (spec.IsActorTask()) { @@ -920,7 +880,8 @@ Status CoreWorker::Put(const RayObject &object, bool object_exists; if (options_.is_local_mode || (RayConfig::instance().put_small_object_in_memory_store() && - static_cast(object.GetSize()) < max_direct_call_object_size_)) { + static_cast(object.GetSize()) < + RayConfig::instance().max_direct_call_object_size())) { RAY_LOG(DEBUG) << "Put " << object_id << " in memory store"; RAY_CHECK(memory_store_->Put(object, object_id)); return Status::OK(); @@ -961,7 +922,8 @@ Status CoreWorker::CreateOwned(const std::shared_ptr &metadata, NodeID::FromBinary(rpc_address_.raylet_id())); if (options_.is_local_mode || (RayConfig::instance().put_small_object_in_memory_store() && - static_cast(data_size) < max_direct_call_object_size_)) { + static_cast(data_size) < + RayConfig::instance().max_direct_call_object_size())) { *data = std::make_shared(data_size); } else { auto status = @@ -1074,7 +1036,7 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_m bool missing_result = false; bool will_throw_exception = false; for (size_t i = 0; i < ids.size(); i++) { - const auto pair = result_map.find(ids[i]); + auto pair = result_map.find(ids[i]); if (pair != result_map.end()) { (*results)[i] = pair->second; RAY_CHECK(!pair->second->IsInPlasmaError()); @@ -1096,23 +1058,6 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t 
timeout_m return Status::OK(); } -Status CoreWorker::GetIfLocal(const std::vector &ids, - std::vector> *results) { - results->resize(ids.size(), nullptr); - - absl::flat_hash_map> result_map; - RAY_RETURN_NOT_OK(plasma_store_provider_->GetIfLocal(ids, &result_map)); - for (size_t i = 0; i < ids.size(); i++) { - auto pair = result_map.find(ids[i]); - // The caller of this method should guarantee that the object exists in the plasma - // store when this method is called. - RAY_CHECK(pair != result_map.end()); - RAY_CHECK(pair->second != nullptr); - (*results)[i] = pair->second; - } - return Status::OK(); -} - Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { bool found = false; bool in_plasma = false; @@ -1300,8 +1245,6 @@ void CoreWorker::SpillOwnedObject(const ObjectID &object_id, RAY_LOG(ERROR) << "Failed to spill object " << object_id << ", raylet unreachable or object could not be spilled."; } - // TODO(Clark): Provide spilled URL and spilled node ID to callback so it can - // added them to the reference. callback(); }); } @@ -1312,7 +1255,6 @@ Status CoreWorker::SpillObjects(const std::vector &object_ids) { auto ready_promise = std::make_shared>(std::promise()); Status final_status; - // TODO(Clark): Add spilled URL and spilled node ID to reference in this callback. auto callback = [mutex, num_remaining, ready_promise]() { absl::MutexLock lock(mutex.get()); (*num_remaining)--; @@ -1352,10 +1294,7 @@ Status CoreWorker::SpillObjects(const std::vector &object_ids) { ready_promise->get_future().wait(); for (const auto &object_id : object_ids) { - // TODO(Clark): Move this to the callback (unless we really wanted to batch it) and - // also include the spilled URL, spilled node ID, and updated object size. 
- reference_counter_->HandleObjectSpilled(object_id, "", NodeID::Nil(), -1, - /*release*/ true); + reference_counter_->HandleObjectSpilled(object_id); } return final_status; } @@ -1524,8 +1463,8 @@ Status CoreWorker::CreatePlacementGroup( builder.SetPlacementGroupSpec( placement_group_id, placement_group_creation_options.name, placement_group_creation_options.bundles, placement_group_creation_options.strategy, - placement_group_creation_options.is_detached, worker_context_.GetCurrentJobID(), - worker_context_.GetCurrentActorID(), worker_context_.CurrentActorDetached()); + worker_context_.GetCurrentJobID(), worker_context_.GetCurrentActorID(), + worker_context_.CurrentActorDetached()); PlacementGroupSpecification placement_group_spec = builder.Build(); *return_placement_group_id = placement_group_id; RAY_LOG(INFO) << "Submitting Placement Group creation to GCS: " << placement_group_id; @@ -1672,9 +1611,7 @@ Status CoreWorker::KillActor(const ActorID &actor_id, bool force_kill, bool no_r stream << "Failed to find a corresponding actor handle for " << actor_id; return Status::Invalid(stream.str()); } - - RAY_CHECK_OK( - gcs_client_->Actors().AsyncKillActor(actor_id, force_kill, no_restart, nullptr)); + direct_actor_submitter_->KillActor(actor_id, force_kill, no_restart); return Status::OK(); } @@ -1823,7 +1760,8 @@ Status CoreWorker::AllocateReturnObjects( // Allocate a buffer for the return object. 
if (options_.is_local_mode || - static_cast(data_sizes[i]) < max_direct_call_object_size_) { + static_cast(data_sizes[i]) < + RayConfig::instance().max_direct_call_object_size()) { data_buffer = std::make_shared(data_sizes[i]); } else { RAY_RETURN_NOT_OK(CreateExisting(metadatas[i], data_sizes[i], object_ids[i], @@ -2264,29 +2202,18 @@ void CoreWorker::HandleGetObjectLocationsOwner( return; } auto object_id = ObjectID::FromBinary(request.object_id()); - const auto &callback = [object_id, reply, send_reply_callback]( - const absl::flat_hash_set &locations, - int64_t object_size, const std::string &spilled_url, - const NodeID &spilled_node_id, int64_t current_version) { - RAY_LOG(DEBUG) << "Replying to HandleGetObjectLocationsOwner for " << object_id - << " with location update version " << current_version << ", " - << locations.size() << " locations, " << spilled_url - << " spilled url, " << spilled_node_id << " spilled node ID, and " - << object_size << " object size."; - for (const auto &node_id : locations) { + absl::optional> node_ids = + reference_counter_->GetObjectLocations(object_id); + Status status; + if (node_ids.has_value()) { + for (const auto &node_id : node_ids.value()) { reply->add_node_ids(node_id.Binary()); } - reply->set_object_size(object_size); - reply->set_spilled_url(spilled_url); - reply->set_spilled_node_id(spilled_node_id.Binary()); - reply->set_current_version(current_version); - send_reply_callback(Status::OK(), nullptr, nullptr); - }; - auto status = reference_counter_->SubscribeObjectLocations( - object_id, request.last_version(), callback); - if (!status.ok()) { - send_reply_callback(status, nullptr, nullptr); + status = Status::OK(); + } else { + status = Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); } + send_reply_callback(status, nullptr, nullptr); } void CoreWorker::HandleWaitForRefRemoved(const rpc::WaitForRefRemovedRequest &request, @@ -2321,17 +2248,12 @@ void CoreWorker::HandleCancelTask(const 
rpc::CancelTaskRequest &request, rpc::SendReplyCallback send_reply_callback) { absl::MutexLock lock(&mutex_); TaskID task_id = TaskID::FromBinary(request.intended_task_id()); - bool requested_task_running = main_thread_task_id_ == task_id; - bool success = requested_task_running; + bool success = main_thread_task_id_ == task_id; // Try non-force kill - if (requested_task_running && !request.force_kill()) { + if (success && !request.force_kill()) { RAY_LOG(INFO) << "Interrupting a running task " << main_thread_task_id_; success = options_.kill_main(); - } else if (!requested_task_running) { - // If the task is not currently running, check if it is in the worker's queue of - // normal tasks, and remove it if found. - success = direct_task_receiver_->CancelQueuedNormalTask(task_id); } if (request.recursive()) { auto recursive_cancel = CancelChildren(task_id, request.force_kill()); @@ -2340,14 +2262,11 @@ void CoreWorker::HandleCancelTask(const rpc::CancelTaskRequest &request, } } - // TODO: fix race condition to avoid using this hack - requested_task_running = main_thread_task_id_ == task_id; - reply->set_attempt_succeeded(success); send_reply_callback(Status::OK(), nullptr, nullptr); // Do force kill after reply callback sent - if (requested_task_running && request.force_kill()) { + if (success && request.force_kill()) { RAY_LOG(INFO) << "Force killing a worker running " << main_thread_task_id_; Disconnect(); if (options_.enable_logging) { @@ -2471,13 +2390,7 @@ void CoreWorker::HandleSpillObjects(const rpc::SpillObjectsRequest &request, for (const auto &id_binary : request.object_ids_to_spill()) { object_ids_to_spill.push_back(ObjectID::FromBinary(id_binary)); } - std::vector owner_addresses; - owner_addresses.reserve(request.owner_addresses_size()); - for (const auto &owner_address : request.owner_addresses()) { - owner_addresses.push_back(owner_address.SerializeAsString()); - } - std::vector object_urls = - options_.spill_objects(object_ids_to_spill, 
owner_addresses); + std::vector object_urls = options_.spill_objects(object_ids_to_spill); for (size_t i = 0; i < object_urls.size(); i++) { reply->add_spilled_objects_url(std::move(object_urls[i])); } @@ -2488,24 +2401,6 @@ void CoreWorker::HandleSpillObjects(const rpc::SpillObjectsRequest &request, } } -void CoreWorker::HandleAddSpilledUrl(const rpc::AddSpilledUrlRequest &request, - rpc::AddSpilledUrlReply *reply, - rpc::SendReplyCallback send_reply_callback) { - const ObjectID object_id = ObjectID::FromBinary(request.object_id()); - const std::string &spilled_url = request.spilled_url(); - const NodeID node_id = NodeID::FromBinary(request.spilled_node_id()); - RAY_LOG(DEBUG) << "Received AddSpilledUrl request for object " << object_id - << ", which has been spilled to " << spilled_url << " on node " - << node_id; - auto reference_exists = reference_counter_->HandleObjectSpilled( - object_id, spilled_url, node_id, request.size(), /*release*/ false); - Status status = - reference_exists - ? Status::OK() - : Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); - send_reply_callback(status, nullptr, nullptr); -} - void CoreWorker::HandleRestoreSpilledObjects( const rpc::RestoreSpilledObjectsRequest &request, rpc::RestoreSpilledObjectsReply *reply, rpc::SendReplyCallback send_reply_callback) { diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 47023df7b40b..088ba346a70c 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -82,7 +82,6 @@ struct CoreWorkerOptions { spill_objects(nullptr), restore_spilled_objects(nullptr), delete_spilled_objects(nullptr), - unhandled_exception_handler(nullptr), get_lang_stack(nullptr), kill_main(nullptr), ref_counting_enabled(false), @@ -138,17 +137,13 @@ struct CoreWorkerOptions { /// be held up in garbage objects. std::function gc_collect; /// Application-language callback to spill objects to external storage. 
- std::function(const std::vector &, - const std::vector &)> - spill_objects; + std::function(const std::vector &)> spill_objects; /// Application-language callback to restore objects from external storage. std::function &, const std::vector &)> restore_spilled_objects; /// Application-language callback to delete objects from external storage. std::function &, rpc::WorkerType)> delete_spilled_objects; - /// Function to call on error objects never retrieved. - std::function unhandled_exception_handler; /// Language worker callback to get the current call stack. std::function get_lang_stack; // Function that tries to interrupt the currently running Python thread. @@ -270,8 +265,6 @@ class CoreWorkerProcess { /// \return Void. static void EnsureInitialized(); - static void HandleAtExit(); - /// Get the `CoreWorker` instance by worker ID. /// /// \param[in] workerId The worker ID. @@ -562,20 +555,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { std::vector> *results, bool plasma_objects_only = false); - /// Get objects directly from the local plasma store, without waiting for the - /// objects to be fetched from another node. This should only be used - /// internally, never by user code. - /// NOTE: Caller of this method should guarantee that the object already exists in the - /// plasma store, thus it doesn't need to fetch from other nodes. - /// - /// \param[in] ids The IDs of the objects to get. - /// \param[out] results The results will be stored here. A nullptr will be - /// added for objects that were not in the local store. - /// \return Status OK if all objects were found. Returns ObjectNotFound error - /// if at least one object was not in the local store. - Status GetIfLocal(const std::vector &ids, - std::vector> *results); - /// Return whether or not the object store contains the given object. /// /// \param[in] object_id ID of the objects to check for. 
@@ -735,7 +714,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Tell an actor to exit immediately, without completing outstanding work. /// /// \param[in] actor_id ID of the actor to kill. - /// \param[in] force_kill Whether to force kill an actor by killing the worker. /// \param[in] no_restart If set to true, the killed actor will not be /// restarted anymore. /// \param[out] Status @@ -916,11 +894,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { rpc::SpillObjectsReply *reply, rpc::SendReplyCallback send_reply_callback) override; - // Add spilled URL to owned reference. - void HandleAddSpilledUrl(const rpc::AddSpilledUrlRequest &request, - rpc::AddSpilledUrlReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - // Restore objects from external storage. void HandleRestoreSpilledObjects(const rpc::RestoreSpilledObjectsRequest &request, rpc::RestoreSpilledObjectsReply *reply, @@ -1268,8 +1241,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Whether we are shutting down and not running further tasks. 
bool exiting_ = false; - int64_t max_direct_call_object_size_; - friend class CoreWorkerTest; }; diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h index daa4e05a9300..69c05cf9315f 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h @@ -25,7 +25,7 @@ extern "C" { * Class: io_ray_runtime_RayNativeRuntime * Method: nativeInitialize * Signature: - * (ILjava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/String;[BLio/ray/runtime/gcs/GcsClientOptions;ILjava/lang/String;Ljava/util/Map;[B)V + * (ILjava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/String;[BLio/ray/runtime/gcs/GcsClientOptions;ILjava/lang/String;Ljava/util/Map;)V */ JNIEXPORT void JNICALL Java_io_ray_runtime_RayNativeRuntime_nativeInitialize( JNIEnv *, jclass, jint, jstring, jint, jstring, jstring, jstring, jbyteArray, jobject, diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h index fd194de55701..b1da06e57068 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h @@ -52,7 +52,7 @@ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeGet /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeWait - * Signature: (Ljava/util/List;IJZ)Ljava/util/List; + * Signature: (Ljava/util/List;IJ)Ljava/util/List; */ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeWait( JNIEnv *, jclass, jobject, jint, jlong, jboolean); @@ -68,7 +68,7 @@ JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeDelete /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeAddLocalReference - * Signature: ([B[B)V + * Signature: 
([B)V */ JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeAddLocalReference(JNIEnv *, jclass, @@ -78,7 +78,7 @@ Java_io_ray_runtime_object_NativeObjectStore_nativeAddLocalReference(JNIEnv *, j /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeRemoveLocalReference - * Signature: ([B[B)V + * Signature: ([B)V */ JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeRemoveLocalReference(JNIEnv *, jclass, diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h index ab7ec077d453..bf376aa12e64 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h @@ -21,6 +21,25 @@ #ifdef __cplusplus extern "C" { #endif +#undef io_ray_runtime_task_NativeTaskExecutor_NUM_ACTOR_CHECKPOINTS_TO_KEEP +#define io_ray_runtime_task_NativeTaskExecutor_NUM_ACTOR_CHECKPOINTS_TO_KEEP 20L +/* + * Class: io_ray_runtime_task_NativeTaskExecutor + * Method: nativePrepareCheckpoint + * Signature: ()[B + */ +JNIEXPORT jbyteArray JNICALL +Java_io_ray_runtime_task_NativeTaskExecutor_nativePrepareCheckpoint(JNIEnv *, jclass); + +/* + * Class: io_ray_runtime_task_NativeTaskExecutor + * Method: nativeNotifyActorResumedFromCheckpoint + * Signature: ([B)V + */ +JNIEXPORT void JNICALL +Java_io_ray_runtime_task_NativeTaskExecutor_nativeNotifyActorResumedFromCheckpoint( + JNIEnv *, jclass, jbyteArray); + #ifdef __cplusplus } #endif diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc index cd374b76a272..5470f70fb395 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc @@ -201,8 +201,7 @@ inline ray::PlacementGroupCreationOptions 
ToPlacementGroupCreationOptions( }); }); return ray::PlacementGroupCreationOptions(JavaStringToNativeString(env, name), - ConvertStrategy(java_strategy), bundles, - /*is_detached=*/false); + ConvertStrategy(java_strategy), bundles); } #ifdef __cplusplus diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h index d57e2d573188..8ea517b60cf9 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h @@ -74,13 +74,13 @@ Java_io_ray_runtime_task_NativeTaskSubmitter_nativeRemovePlacementGroup(JNIEnv * /* * Class: io_ray_runtime_task_NativeTaskSubmitter * Method: nativeWaitPlacementGroupReady - * Signature: ([BI)Z + * Signature: (J)Z */ JNIEXPORT jboolean JNICALL -Java_io_ray_runtime_task_NativeTaskSubmitter_nativeWaitPlacementGroupReady(JNIEnv *, - jclass, - jbyteArray, - jint); +Java_io_ray_runtime_task_NativeTaskSubmitter__nativeWaitPlacementGroupReady(JNIEnv *, + jclass, + jbyteArray, + jint); #ifdef __cplusplus } diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index 652663ecf50c..c638f831dbed 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -185,7 +185,6 @@ void ReferenceCounter::UpdateObjectSize(const ObjectID &object_id, int64_t objec auto it = object_id_refs_.find(object_id); if (it != object_id_refs_.end()) { it->second.object_size = object_size; - PushToLocationSubscribers(it); } } @@ -916,12 +915,11 @@ bool ReferenceCounter::AddObjectLocation(const ObjectID &object_id, absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it == object_id_refs_.end()) { - RAY_LOG(INFO) << "Tried to add an object location for an object " << object_id - << " that doesn't exist in the reference table"; + RAY_LOG(WARNING) << "Tried to add an object location for an 
object " << object_id + << " that doesn't exist in the reference table"; return false; } it->second.locations.insert(node_id); - PushToLocationSubscribers(it); return true; } @@ -930,12 +928,11 @@ bool ReferenceCounter::RemoveObjectLocation(const ObjectID &object_id, absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it == object_id_refs_.end()) { - RAY_LOG(INFO) << "Tried to remove an object location for an object " << object_id - << " that doesn't exist in the reference table"; + RAY_LOG(WARNING) << "Tried to remove an object location for an object " << object_id + << " that doesn't exist in the reference table"; return false; } it->second.locations.erase(node_id); - PushToLocationSubscribers(it); return true; } @@ -951,42 +948,17 @@ absl::optional> ReferenceCounter::GetObjectLocations return it->second.locations; } -size_t ReferenceCounter::GetObjectSize(const ObjectID &object_id) const { - absl::MutexLock lock(&mutex_); - auto it = object_id_refs_.find(object_id); - if (it == object_id_refs_.end()) { - return 0; - } - return it->second.object_size; -} - -bool ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id, - const std::string spilled_url, - const NodeID &spilled_node_id, int64_t size, - bool release) { +void ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id) { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); if (it == object_id_refs_.end()) { RAY_LOG(WARNING) << "Spilled object " << object_id << " already out of scope"; - return false; + return; } it->second.spilled = true; - if (spilled_url != "") { - it->second.spilled_url = spilled_url; - } - if (!spilled_node_id.IsNil()) { - it->second.spilled_node_id = spilled_node_id; - } - if (size > 0) { - it->second.object_size = size; - } - PushToLocationSubscribers(it); - if (release) { - // Release the primary plasma copy, if any. - ReleasePlasmaObject(it); - } - return true; + // Release the primary plasma copy, if any. 
+ ReleasePlasmaObject(it); } absl::optional ReferenceCounter::GetLocalityData( @@ -1022,41 +994,6 @@ absl::optional ReferenceCounter::GetLocalityData( return locality_data; } -void ReferenceCounter::PushToLocationSubscribers(ReferenceTable::iterator it) { - const auto callbacks = it->second.location_subscription_callbacks; - it->second.location_subscription_callbacks.clear(); - it->second.location_version++; - for (const auto &callback : callbacks) { - callback(it->second.locations, it->second.object_size, it->second.spilled_url, - it->second.spilled_node_id, it->second.location_version); - } -} - -Status ReferenceCounter::SubscribeObjectLocations( - const ObjectID &object_id, int64_t last_location_version, - const LocationSubscriptionCallback &callback) { - absl::MutexLock lock(&mutex_); - auto it = object_id_refs_.find(object_id); - if (it == object_id_refs_.end()) { - RAY_LOG(INFO) << "Tried to register a location subscriber for an object " << object_id - << " that doesn't exist in the reference table." - << " The object has probably already been freed."; - return Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); - } - - if (last_location_version < it->second.location_version) { - // If the last location version is less than the current location version, we - // already have location data that the subscriber hasn't seen yet, so we immediately - // invoke the callback. - callback(it->second.locations, it->second.object_size, it->second.spilled_url, - it->second.spilled_node_id, it->second.location_version); - } else { - // Otherwise, save the callback for later invocation. 
- it->second.location_subscription_callbacks.push_back(callback); - } - return Status::OK(); -} - ReferenceCounter::Reference ReferenceCounter::Reference::FromProto( const rpc::ObjectReferenceCount &ref_count) { Reference ref; diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index 415044d702dd..caceabc53ab5 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -49,11 +49,6 @@ class ReferenceCounterInterface { virtual ~ReferenceCounterInterface() {} }; -// Callback for location subscriptions. -using LocationSubscriptionCallback = - std::function &, int64_t, const std::string &, - const NodeID &, int64_t)>; - /// Class used by the core worker to keep track of ObjectID reference counts for garbage /// collection. This class is thread safe. class ReferenceCounter : public ReferenceCounterInterface, @@ -402,37 +397,11 @@ class ReferenceCounter : public ReferenceCounterInterface, absl::optional> GetObjectLocations( const ObjectID &object_id) LOCKS_EXCLUDED(mutex_); - /// Subscribe to object location changes that are more recent than the given version. - /// The provided callback will be invoked when new locations become available. - /// - /// \param[in] object_id The object whose locations we want. - /// \param[in] last_location_version The version of the last location update the - /// caller received. Only more recent location updates will be returned. - /// \param[in] callback The callback to invoke with the location update. - /// \return The status of the location get. - Status SubscribeObjectLocations(const ObjectID &object_id, - int64_t last_location_version, - const LocationSubscriptionCallback &callback) - LOCKS_EXCLUDED(mutex_); - - /// Get an object's size. This will return 0 if the object is out of scope. - /// - /// \param[in] object_id The object whose size to get. - /// \return Object size, or 0 if the object is out of scope. 
- size_t GetObjectSize(const ObjectID &object_id) const; - /// Handle an object has been spilled to external storage. /// /// This notifies the primary raylet that the object is safe to release and - /// records the spill URL, spill node ID, and updated object size. - /// \param[in] object_id The object that has been spilled. - /// \param[in] spilled_url The URL to which the object has been spilled. - /// \param[in] spilled_node_id The ID of the node on which the object was spilled. - /// \param[in] size The size of the object. - /// \param[in] release Whether to release the reference. - /// \return True if the reference exists, false otherwise. - bool HandleObjectSpilled(const ObjectID &object_id, const std::string spilled_url, - const NodeID &spilled_node_id, int64_t size, bool release); + /// records that the object has been spilled to suppress reconstruction. + void HandleObjectSpilled(const ObjectID &object_id); /// Get locality data for object. absl::optional GetLocalityData(const ObjectID &object_id); @@ -517,17 +486,13 @@ class ReferenceCounter : public ReferenceCounterInterface, /// process is a borrower, the borrower must add the owner's address before /// using the ObjectID. absl::optional owner_address; - /// If this object is owned by us and stored in plasma, and reference - /// counting is enabled, then some raylet must be pinning the object value. - /// This is the address of that raylet. + // If this object is owned by us and stored in plasma, and reference + // counting is enabled, then some raylet must be pinning the object value. + // This is the address of that raylet. absl::optional pinned_at_raylet_id; - /// If this object is owned by us and stored in plasma, this contains all - /// object locations. + // If this object is owned by us and stored in plasma, this contains all + // object locations. absl::flat_hash_set locations; - /// A logical counter for object location updates, used for object location - /// subscriptions. 
Subscribers use -1 to indicate that they want us to - /// immediately send them the current location data. - int64_t location_version = 0; // Whether this object can be reconstructed via lineage. If false, then the // object's value will be pinned as long as it is referenced by any other // object's lineage. @@ -594,16 +559,7 @@ class ReferenceCounter : public ReferenceCounterInterface, size_t lineage_ref_count = 0; /// Whether this object has been spilled to external storage. bool spilled = false; - /// For objects that have been spilled to external storage, the URL from which - /// they can be retrieved. - std::string spilled_url = ""; - /// The ID of the node that spilled the object. - /// This will be Nil if the object has not been spilled or if it is spilled - /// distributed external storage. - NodeID spilled_node_id = NodeID::Nil(); - /// Location subscription callbacks registered by async location get requests. - /// These will be invoked whenever locations or object_size are changed. - std::vector location_subscription_callbacks; + /// Callback that will be called when this ObjectID no longer has /// references. std::function on_delete; @@ -727,12 +683,6 @@ class ReferenceCounter : public ReferenceCounterInterface, void ReleaseLineageReferencesInternal(const std::vector &argument_ids) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - /// Pushes location updates to subscribers of a particular reference, invoking all - /// callbacks registered for the reference by GetLocationsAsync calls. This method - /// also increments the reference's location version counter. - void PushToLocationSubscribers(ReferenceTable::iterator it) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - /// Address of our RPC server. This is used to determine whether we own a /// given object or not, by comparing our WorkerID with the WorkerID of the /// object's owner. 
diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.cc b/src/ray/core_worker/store_provider/memory_store/memory_store.cc index 7897b6504e82..6dad1b37be72 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.cc +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.cc @@ -93,7 +93,6 @@ void GetRequest::Set(const ObjectID &object_id, std::shared_ptr objec if (is_ready_) { return; // We have already hit the number of objects to return limit. } - object->SetAccessed(); objects_.emplace(object_id, object); if (objects_.size() == num_objects_ || (abort_if_any_object_is_exception_ && object->IsException() && @@ -107,7 +106,6 @@ std::shared_ptr GetRequest::Get(const ObjectID &object_id) const { std::unique_lock lock(mutex_); auto iter = objects_.find(object_id); if (iter != objects_.end()) { - iter->second->SetAccessed(); return iter->second; } @@ -118,13 +116,11 @@ CoreWorkerMemoryStore::CoreWorkerMemoryStore( std::function store_in_plasma, std::shared_ptr counter, std::shared_ptr raylet_client, - std::function check_signals, - std::function unhandled_exception_handler) + std::function check_signals) : store_in_plasma_(store_in_plasma), ref_counter_(counter), raylet_client_(raylet_client), - check_signals_(check_signals), - unhandled_exception_handler_(unhandled_exception_handler) {} + check_signals_(check_signals) {} void CoreWorkerMemoryStore::GetAsync( const ObjectID &object_id, std::function)> callback) { @@ -140,7 +136,6 @@ void CoreWorkerMemoryStore::GetAsync( } // It's important for performance to run the callback outside the lock. 
if (ptr != nullptr) { - ptr->SetAccessed(); callback(ptr); } } @@ -151,7 +146,6 @@ std::shared_ptr CoreWorkerMemoryStore::GetOrPromoteToPlasma( auto iter = objects_.find(object_id); if (iter != objects_.end()) { auto obj = iter->second; - obj->SetAccessed(); if (obj->IsInPlasmaError()) { return nullptr; } @@ -216,8 +210,6 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ if (should_add_entry) { // If there is no existing get request, then add the `RayObject` to map. objects_.emplace(object_id, object_entry); - } else { - OnErase(object_entry); } } @@ -231,7 +223,6 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_ // It's important for performance to run the callbacks outside the lock. for (const auto &cb : async_callbacks) { - object_entry->SetAccessed(); cb(object_entry); } @@ -266,7 +257,6 @@ Status CoreWorkerMemoryStore::GetImpl(const std::vector &object_ids, const auto &object_id = object_ids[i]; auto iter = objects_.find(object_id); if (iter != objects_.end()) { - iter->second->SetAccessed(); (*results)[i] = iter->second; if (remove_after_get) { // Note that we cannot remove the object_id from `objects_` now, @@ -436,7 +426,6 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i if (it->second->IsInPlasmaError()) { plasma_ids_to_delete->insert(object_id); } else { - OnErase(it->second); objects_.erase(it); } } @@ -446,11 +435,7 @@ void CoreWorkerMemoryStore::Delete(const absl::flat_hash_set &object_i void CoreWorkerMemoryStore::Delete(const std::vector &object_ids) { absl::MutexLock lock(&mu_); for (const auto &object_id : object_ids) { - auto it = objects_.find(object_id); - if (it != objects_.end()) { - OnErase(it->second); - objects_.erase(it); - } + objects_.erase(object_id); } } @@ -466,14 +451,6 @@ bool CoreWorkerMemoryStore::Contains(const ObjectID &object_id, bool *in_plasma) return false; } -void CoreWorkerMemoryStore::OnErase(std::shared_ptr obj) { - // TODO(ekl) 
note that this doesn't warn on errors that are stored in plasma. - if (obj->IsException() && !obj->IsInPlasmaError() && !obj->WasAccessed() && - unhandled_exception_handler_ != nullptr) { - unhandled_exception_handler_(*obj); - } -} - MemoryStoreStats CoreWorkerMemoryStore::GetMemoryStoreStatisticalData() { absl::MutexLock lock(&mu_); MemoryStoreStats item; diff --git a/src/ray/core_worker/store_provider/memory_store/memory_store.h b/src/ray/core_worker/store_provider/memory_store/memory_store.h index 0ca94ef6cc02..709227f65206 100644 --- a/src/ray/core_worker/store_provider/memory_store/memory_store.h +++ b/src/ray/core_worker/store_provider/memory_store/memory_store.h @@ -35,8 +35,7 @@ class CoreWorkerMemoryStore { std::function store_in_plasma = nullptr, std::shared_ptr counter = nullptr, std::shared_ptr raylet_client = nullptr, - std::function check_signals = nullptr, - std::function unhandled_exception_handler = nullptr); + std::function check_signals = nullptr); ~CoreWorkerMemoryStore(){}; /// Put an object with specified ID into object store. @@ -144,9 +143,6 @@ class CoreWorkerMemoryStore { std::vector> *results, bool abort_if_any_object_is_exception); - /// Called when an object is erased from the store. - void OnErase(std::shared_ptr obj); - /// Optional callback for putting objects into the plasma store. std::function store_in_plasma_; @@ -177,9 +173,6 @@ class CoreWorkerMemoryStore { /// Function passed in to be called to check for signals (e.g., Ctrl-C). std::function check_signals_; - - /// Function called to report unhandled exceptions. 
- std::function unhandled_exception_handler_; }; } // namespace ray diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index f3b5f047c8fc..831f2629a9b1 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -191,8 +191,7 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( std::vector plasma_results; { std::lock_guard guard(store_client_mutex_); - RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, timeout_ms, &plasma_results, - /*is_from_worker=*/true)); + RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, timeout_ms, &plasma_results)); } // Add successfully retrieved objects to the result map and remove them from @@ -226,40 +225,6 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( return Status::OK(); } -Status CoreWorkerPlasmaStoreProvider::GetIfLocal( - const std::vector &object_ids, - absl::flat_hash_map> *results) { - std::vector plasma_results; - { - std::lock_guard guard(store_client_mutex_); - // Since this path is used only for spilling, we should set is_from_worker: false. - RAY_RETURN_NOT_OK(store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results, - /*is_from_worker=*/false)); - } - - for (size_t i = 0; i < object_ids.size(); i++) { - if (plasma_results[i].data != nullptr || plasma_results[i].metadata != nullptr) { - const auto &object_id = object_ids[i]; - std::shared_ptr data = nullptr; - std::shared_ptr metadata = nullptr; - if (plasma_results[i].data && plasma_results[i].data->Size()) { - // We track the set of active data buffers in active_buffers_. On destruction, - // the buffer entry will be removed from the set via callback. 
- data = std::make_shared(plasma_results[i].data, buffer_tracker_, - object_id); - buffer_tracker_->Record(object_id, data.get(), get_current_call_site_()); - } - if (plasma_results[i].metadata && plasma_results[i].metadata->Size()) { - metadata = plasma_results[i].metadata; - } - const auto result_object = - std::make_shared(data, metadata, std::vector()); - (*results)[object_id] = result_object; - } - } - return Status::OK(); -} - Status UnblockIfNeeded(const std::shared_ptr &client, const WorkerContext &ctx) { if (ctx.CurrentTaskIsDirectCall()) { @@ -464,7 +429,7 @@ Status CoreWorkerPlasmaStoreProvider::WarmupStore() { RAY_RETURN_NOT_OK(Create(nullptr, 8, object_id, rpc::Address(), &data)); RAY_RETURN_NOT_OK(Seal(object_id)); RAY_RETURN_NOT_OK(Release(object_id)); - RAY_RETURN_NOT_OK(Delete({object_id}, true)); + RAY_RETURN_NOT_OK(Delete({object_id}, false)); return Status::OK(); } diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index e67c561b6c9c..2282a09a91b1 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -143,18 +143,6 @@ class CoreWorkerPlasmaStoreProvider { absl::flat_hash_map> *results, bool *got_exception); - /// Get objects directly from the local plasma store, without waiting for the - /// objects to be fetched from another node. This should only be used - /// internally, never by user code. - /// - /// \param[in] ids The IDs of the objects to get. - /// \param[out] results The results will be stored here. A nullptr will be - /// added for objects that were not in the local store. - /// \return Status OK if the request to the local object store was - /// successful. 
- Status GetIfLocal(const std::vector &ids, - absl::flat_hash_map> *results); - Status Contains(const ObjectID &object_id, bool *has_object); Status Wait(const absl::flat_hash_set &object_ids, int num_objects, diff --git a/src/ray/core_worker/test/core_worker_test.cc b/src/ray/core_worker/test/core_worker_test.cc index cf1bab624de2..82ea826175e4 100644 --- a/src/ray/core_worker/test/core_worker_test.cc +++ b/src/ray/core_worker/test/core_worker_test.cc @@ -841,48 +841,6 @@ TEST_F(SingleNodeTest, TestNormalTaskLocal) { TestNormalTask(resources); } -TEST_F(SingleNodeTest, TestCancelTasks) { - auto &driver = CoreWorkerProcess::GetCoreWorker(); - - // Create two functions, each implementing a while(true) loop. - RayFunction func1(ray::Language::PYTHON, ray::FunctionDescriptorBuilder::BuildPython( - "WhileTrueLoop", "", "", "")); - RayFunction func2(ray::Language::PYTHON, ray::FunctionDescriptorBuilder::BuildPython( - "WhileTrueLoop", "", "", "")); - // Return IDs for the two functions that implement while(true) loops. - std::vector return_ids1; - std::vector return_ids2; - - // Create default args and options needed to submit the tasks that encapsulate func1 and - // func2. - std::vector> args; - TaskOptions options; - - // Submit func1. The function should start looping forever. - driver.SubmitTask(func1, args, options, &return_ids1, /*max_retries=*/0, - std::make_pair(PlacementGroupID::Nil(), -1), true, - /*debugger_breakpoint=*/""); - ASSERT_EQ(return_ids1.size(), 1); - - // Submit func2. The function should be queued at the worker indefinitely. - driver.SubmitTask(func2, args, options, &return_ids2, /*max_retries=*/0, - std::make_pair(PlacementGroupID::Nil(), -1), true, - /*debugger_breakpoint=*/""); - ASSERT_EQ(return_ids2.size(), 1); - - // Cancel func2 by removing it from the worker's queue - RAY_CHECK_OK(driver.CancelTask(return_ids2[0], true, false)); - - // Cancel func1, which is currently running. 
- RAY_CHECK_OK(driver.CancelTask(return_ids1[0], true, false)); - - // TestNormalTask will get stuck unless both func1 and func2 have been cancelled. Thus, - // if TestNormalTask succeeds, we know that func2 must have been removed from the - // worker's queue. - std::unordered_map resources; - TestNormalTask(resources); -} - TEST_F(TwoNodeTest, TestNormalTaskCrossNodes) { std::unordered_map resources; resources.emplace("resource1", 1); diff --git a/src/ray/core_worker/test/memory_store_test.cc b/src/ray/core_worker/test/memory_store_test.cc deleted file mode 100644 index f4403e4a887e..000000000000 --- a/src/ray/core_worker/test/memory_store_test.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/core_worker/store_provider/memory_store/memory_store.h" - -#include "gtest/gtest.h" -#include "ray/common/test_util.h" - -namespace ray { - -TEST(TestMemoryStore, TestReportUnhandledErrors) { - std::vector> results; - WorkerContext context(WorkerType::WORKER, WorkerID::FromRandom(), JobID::FromInt(0)); - int unhandled_count = 0; - - std::shared_ptr provider = - std::make_shared( - nullptr, nullptr, nullptr, nullptr, - [&](const RayObject &obj) { unhandled_count++; }); - RayObject obj1(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); - RayObject obj2(rpc::ErrorType::TASK_EXECUTION_EXCEPTION); - auto id1 = ObjectID::FromRandom(); - auto id2 = ObjectID::FromRandom(); - - // Check delete without get. - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj2, id2)); - ASSERT_EQ(unhandled_count, 0); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 2); - unhandled_count = 0; - - // Check delete after get. - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj1, id2)); - provider->Get({id1}, 1, 100, context, false, &results); - provider->GetOrPromoteToPlasma(id2); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 0); - - // Check delete after async get. 
- provider->GetAsync({id2}, [](std::shared_ptr obj) {}); - RAY_CHECK(provider->Put(obj1, id1)); - RAY_CHECK(provider->Put(obj2, id2)); - provider->GetAsync({id1}, [](std::shared_ptr obj) {}); - provider->Delete({id1, id2}); - ASSERT_EQ(unhandled_count, 0); -} - -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/core_worker/test/mock_worker.cc b/src/ray/core_worker/test/mock_worker.cc index 03a78a1981a7..4439519bb5ce 100644 --- a/src/ray/core_worker/test/mock_worker.cc +++ b/src/ray/core_worker/test/mock_worker.cc @@ -79,8 +79,6 @@ class MockWorker { } else if ("MergeInputArgsAsOutput" == typed_descriptor->ModuleName()) { // Merge input args and write the merged content to each of return ids return MergeInputArgsAsOutput(args, return_ids, results); - } else if ("WhileTrueLoop" == typed_descriptor->ModuleName()) { - return WhileTrueLoop(args, return_ids, results); } else { return Status::TypeError("Unknown function descriptor: " + typed_descriptor->ModuleName()); @@ -130,15 +128,6 @@ class MockWorker { return Status::OK(); } - Status WhileTrueLoop(const std::vector> &args, - const std::vector &return_ids, - std::vector> *results) { - while (1) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - return Status::OK(); - } - int64_t prev_seq_no_ = 0; }; diff --git a/src/ray/core_worker/test/scheduling_queue_test.cc b/src/ray/core_worker/test/scheduling_queue_test.cc index 6854c1810e3e..8c8e60fd5251 100644 --- a/src/ray/core_worker/test/scheduling_queue_test.cc +++ b/src/ray/core_worker/test/scheduling_queue_test.cc @@ -66,9 +66,9 @@ TEST(SchedulingQueueTest, TestWaitForObjects) { auto fn_ok = [&n_ok]() { n_ok++; }; auto fn_rej = [&n_rej]() { n_rej++; }; queue.Add(0, -1, fn_ok, fn_rej); - queue.Add(1, -1, fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj1})); - queue.Add(2, -1, fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj2})); - queue.Add(3, -1, 
fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj3})); + queue.Add(1, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj1})); + queue.Add(2, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj2})); + queue.Add(3, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj3})); ASSERT_EQ(n_ok, 1); waiter.Complete(0); @@ -92,7 +92,7 @@ TEST(SchedulingQueueTest, TestWaitForObjectsNotSubjectToSeqTimeout) { auto fn_ok = [&n_ok]() { n_ok++; }; auto fn_rej = [&n_rej]() { n_rej++; }; queue.Add(0, -1, fn_ok, fn_rej); - queue.Add(1, -1, fn_ok, fn_rej, TaskID::Nil(), ObjectIdsToRefs({obj1})); + queue.Add(1, -1, fn_ok, fn_rej, ObjectIdsToRefs({obj1})); ASSERT_EQ(n_ok, 1); io_service.run(); ASSERT_EQ(n_rej, 0); @@ -158,25 +158,6 @@ TEST(SchedulingQueueTest, TestSkipAlreadyProcessedByClient) { ASSERT_EQ(n_rej, 2); } -TEST(SchedulingQueueTest, TestCancelQueuedTask) { - NormalSchedulingQueue *queue = new NormalSchedulingQueue(); - ASSERT_TRUE(queue->TaskQueueEmpty()); - int n_ok = 0; - int n_rej = 0; - auto fn_ok = [&n_ok]() { n_ok++; }; - auto fn_rej = [&n_rej]() { n_rej++; }; - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - queue->Add(-1, -1, fn_ok, fn_rej); - ASSERT_TRUE(queue->CancelTaskIfFound(TaskID::Nil())); - ASSERT_FALSE(queue->TaskQueueEmpty()); - queue->ScheduleRequests(); - ASSERT_EQ(n_ok, 4); - ASSERT_EQ(n_rej, 0); -} - } // namespace ray int main(int argc, char **argv) { diff --git a/src/ray/core_worker/transport/direct_actor_transport.cc b/src/ray/core_worker/transport/direct_actor_transport.cc index bac80af4f7a6..e266b0d94f01 100644 --- a/src/ray/core_worker/transport/direct_actor_transport.cc +++ b/src/ray/core_worker/transport/direct_actor_transport.cc @@ -482,12 +482,12 @@ void CoreWorkerDirectTaskReceiver::HandleTask( // TODO(swang): Remove this with legacy raylet code. 
dependencies.pop_back(); it->second->Add(request.sequence_number(), request.client_processed_up_to(), - accept_callback, reject_callback, task_spec.TaskId(), dependencies); + accept_callback, reject_callback, dependencies); } else { // Add the normal task's callbacks to the non-actor scheduling queue. normal_scheduling_queue_->Add(request.sequence_number(), request.client_processed_up_to(), accept_callback, - reject_callback, task_spec.TaskId(), dependencies); + reject_callback, dependencies); } } @@ -501,10 +501,4 @@ void CoreWorkerDirectTaskReceiver::RunNormalTasksFromQueue() { normal_scheduling_queue_->ScheduleRequests(); } -bool CoreWorkerDirectTaskReceiver::CancelQueuedNormalTask(TaskID task_id) { - // Look up the task to be canceled in the queue of normal tasks. If it is found and - // removed successfully, return true. - return normal_scheduling_queue_->CancelTaskIfFound(task_id); -} - } // namespace ray diff --git a/src/ray/core_worker/transport/direct_actor_transport.h b/src/ray/core_worker/transport/direct_actor_transport.h index cbd0a82fccf6..ab28dc85a8ba 100644 --- a/src/ray/core_worker/transport/direct_actor_transport.h +++ b/src/ray/core_worker/transport/direct_actor_transport.h @@ -254,23 +254,19 @@ class InboundRequest { public: InboundRequest(){}; InboundRequest(std::function accept_callback, - std::function reject_callback, TaskID task_id, - bool has_dependencies) + std::function reject_callback, bool has_dependencies) : accept_callback_(accept_callback), reject_callback_(reject_callback), - task_id(task_id), has_pending_dependencies_(has_dependencies) {} void Accept() { accept_callback_(); } void Cancel() { reject_callback_(); } bool CanExecute() const { return !has_pending_dependencies_; } - ray::TaskID TaskID() const { return task_id; } void MarkDependenciesSatisfied() { has_pending_dependencies_ = false; } private: std::function accept_callback_; std::function reject_callback_; - ray::TaskID task_id; bool has_pending_dependencies_; }; @@ 
-350,11 +346,10 @@ class SchedulingQueue { public: virtual void Add(int64_t seq_no, int64_t client_processed_up_to, std::function accept_request, - std::function reject_request, TaskID task_id = TaskID::Nil(), + std::function reject_request, const std::vector &dependencies = {}) = 0; virtual void ScheduleRequests() = 0; virtual bool TaskQueueEmpty() const = 0; - virtual bool CancelTaskIfFound(TaskID task_id) = 0; virtual ~SchedulingQueue(){}; }; @@ -376,7 +371,6 @@ class ActorSchedulingQueue : public SchedulingQueue { /// Add a new actor task's callbacks to the worker queue. void Add(int64_t seq_no, int64_t client_processed_up_to, std::function accept_request, std::function reject_request, - TaskID task_id = TaskID::Nil(), const std::vector &dependencies = {}) { // A seq_no of -1 means no ordering constraint. Actor tasks must be executed in order. RAY_CHECK(seq_no != -1); @@ -389,7 +383,7 @@ class ActorSchedulingQueue : public SchedulingQueue { } RAY_LOG(DEBUG) << "Enqueue " << seq_no << " cur seqno " << next_seq_no_; pending_actor_tasks_[seq_no] = - InboundRequest(accept_request, reject_request, task_id, dependencies.size() > 0); + InboundRequest(accept_request, reject_request, dependencies.size() > 0); if (dependencies.size() > 0) { waiter_.Wait(dependencies, [seq_no, this]() { RAY_CHECK(boost::this_thread::get_id() == main_thread_id_); @@ -403,15 +397,6 @@ class ActorSchedulingQueue : public SchedulingQueue { ScheduleRequests(); } - // We don't allow the cancellation of actor tasks, so invoking CancelTaskIfFound results - // in a fatal error. - bool CancelTaskIfFound(TaskID task_id) { - RAY_CHECK(false) << "Cannot cancel actor tasks"; - // The return instruction will never be executed, but we need to include it - // nonetheless because this is a non-void function. - return false; - } - /// Schedules as many requests as possible in sequence. 
void ScheduleRequests() { // Only call SetMaxActorConcurrency to configure threadpool size when the @@ -535,45 +520,22 @@ class NormalSchedulingQueue : public SchedulingQueue { /// Add a new task's callbacks to the worker queue. void Add(int64_t seq_no, int64_t client_processed_up_to, std::function accept_request, std::function reject_request, - TaskID task_id = TaskID::Nil(), const std::vector &dependencies = {}) { absl::MutexLock lock(&mu_); // Normal tasks should not have ordering constraints. RAY_CHECK(seq_no == -1); // Create a InboundRequest object for the new task, and add it to the queue. pending_normal_tasks_.push_back( - InboundRequest(accept_request, reject_request, task_id, dependencies.size() > 0)); - } - - // Search for an InboundRequest associated with the task that we are trying to cancel. - // If found, remove the InboundRequest from the queue and return true. Otherwise, return - // false. - bool CancelTaskIfFound(TaskID task_id) { - absl::MutexLock lock(&mu_); - for (std::deque::reverse_iterator it = pending_normal_tasks_.rbegin(); - it != pending_normal_tasks_.rend(); ++it) { - if (it->TaskID() == task_id) { - pending_normal_tasks_.erase(std::next(it).base()); - return true; - } - } - return false; + InboundRequest(accept_request, reject_request, dependencies.size() > 0)); } /// Schedules as many requests as possible in sequence. 
void ScheduleRequests() { - while (true) { - InboundRequest head; - { - absl::MutexLock lock(&mu_); - if (!pending_normal_tasks_.empty()) { - head = pending_normal_tasks_.front(); - pending_normal_tasks_.pop_front(); - } else { - return; - } - } + absl::MutexLock lock(&mu_); + while (!pending_normal_tasks_.empty()) { + auto &head = pending_normal_tasks_.front(); head.Accept(); + pending_normal_tasks_.pop_front(); } } @@ -621,8 +583,6 @@ class CoreWorkerDirectTaskReceiver { /// Pop tasks from the queue and execute them sequentially void RunNormalTasksFromQueue(); - bool CancelQueuedNormalTask(TaskID task_id); - private: // Worker context. WorkerContext &worker_context_; diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index db240b411cdf..83dc3de3ca46 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -64,16 +64,6 @@ class ActorInfoAccessor { virtual Status AsyncRegisterActor(const TaskSpecification &task_spec, const StatusCallback &callback) = 0; - /// Kill actor via GCS asynchronously. - /// - /// \param actor_id The ID of actor to destroy. - /// \param force_kill Whether to force kill an actor by killing the worker. - /// \param no_restart If set to true, the killed actor will not be restarted anymore. - /// \param callback Callback that will be called after the actor is destroyed. - /// \return Status - virtual Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, - const StatusCallback &callback) = 0; - /// Asynchronously request GCS to create the actor. /// /// This should be called after the worker has resolved the actor dependencies. @@ -307,18 +297,16 @@ class ObjectInfoAccessor { /// \param callback Callback that will be called after object has been added to GCS. 
/// \return Status virtual Status AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const StatusCallback &callback) = 0; + const StatusCallback &callback) = 0; /// Add spilled location of object to GCS asynchronously. /// /// \param object_id The ID of object which location will be added to GCS. /// \param spilled_url The URL where the object has been spilled. - /// \param spilled_node_id The NodeID where the object has been spilled. /// \param callback Callback that will be called after object has been added to GCS. /// \return Status virtual Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size, const StatusCallback &callback) = 0; /// Remove location of object from GCS asynchronously. @@ -575,7 +563,7 @@ class NodeResourceInfoAccessor { virtual void AsyncReReportResourceUsage() = 0; /// Return resources in last report. Used by light heartbeat. - const std::shared_ptr &GetLastResourceUsage() { + std::shared_ptr &GetLastResourceUsage() { return last_resource_usage_; } @@ -599,6 +587,7 @@ class NodeResourceInfoAccessor { protected: NodeResourceInfoAccessor() = default; + private: /// Cache which stores resource usage in last report used to check if they are changed. /// Used by light resource usage report. std::shared_ptr last_resource_usage_ = @@ -736,7 +725,7 @@ class PlacementGroupInfoAccessor { virtual Status AsyncCreatePlacementGroup( const PlacementGroupSpecification &placement_group_spec) = 0; - /// Get a placement group data from GCS asynchronously by id. + /// Get a placement group data from GCS asynchronously. /// /// \param placement_group_id The id of a placement group to obtain from GCS. /// \return Status. @@ -744,14 +733,6 @@ class PlacementGroupInfoAccessor { const PlacementGroupID &placement_group_id, const OptionalItemCallback &callback) = 0; - /// Get a placement group data from GCS asynchronously by name. 
- /// - /// \param placement_group_name The name of a placement group to obtain from GCS. - /// \return Status. - virtual Status AsyncGetByName( - const std::string &placement_group_name, - const OptionalItemCallback &callback) = 0; - /// Get all placement group info from GCS asynchronously. /// /// \param callback Callback that will be called after lookup finished. diff --git a/src/ray/gcs/gcs_client/global_state_accessor.cc b/src/ray/gcs/gcs_client/global_state_accessor.cc index 669b16e2b4a6..4e9a6fa18cef 100644 --- a/src/ray/gcs/gcs_client/global_state_accessor.cc +++ b/src/ray/gcs/gcs_client/global_state_accessor.cc @@ -259,17 +259,5 @@ std::unique_ptr GlobalStateAccessor::GetPlacementGroupInfo( return placement_group_table_data; } -std::unique_ptr GlobalStateAccessor::GetPlacementGroupByName( - const std::string &placement_group_name) { - std::unique_ptr placement_group_table_data; - std::promise promise; - RAY_CHECK_OK(gcs_client_->PlacementGroups().AsyncGetByName( - placement_group_name, - TransformForOptionalItemCallback( - placement_group_table_data, promise))); - promise.get_future().get(); - return placement_group_table_data; -} - } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/gcs_client/global_state_accessor.h b/src/ray/gcs/gcs_client/global_state_accessor.h index c15963587d65..0c5695780c2a 100644 --- a/src/ray/gcs/gcs_client/global_state_accessor.h +++ b/src/ray/gcs/gcs_client/global_state_accessor.h @@ -151,24 +151,15 @@ class GlobalStateAccessor { /// deserialized with protobuf function. std::vector GetAllPlacementGroupInfo(); - /// Get information of a placement group from GCS Service by ID. + /// Get information of a placement group from GCS Service. /// - /// \param placement_group_id The ID of placement group to look up in the GCS Service. + /// \param placement_group The ID of placement group to look up in the GCS Service. /// \return Placement group info. 
To support multi-language, we serialize each /// PlacementGroupTableData and return the serialized string. Where used, it needs to be /// deserialized with protobuf function. std::unique_ptr GetPlacementGroupInfo( const PlacementGroupID &placement_group_id); - /// Get information of a placement group from GCS Service by name. - /// - /// \param placement_group_name The name of placement group to look up in the GCS - /// Service. \return Placement group info. To support multi-language, we serialize each - /// PlacementGroupTableData and return the serialized string. Where used, it needs to be - /// deserialized with protobuf function. - std::unique_ptr GetPlacementGroupByName( - const std::string &placement_group_name); - private: /// MultiItem transformation helper in template style. /// diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index 5905966cb92a..f9380b78ee12 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -200,26 +200,6 @@ Status ServiceBasedActorInfoAccessor::AsyncRegisterActor( return Status::OK(); } -Status ServiceBasedActorInfoAccessor::AsyncKillActor( - const ActorID &actor_id, bool force_kill, bool no_restart, - const ray::gcs::StatusCallback &callback) { - rpc::KillActorViaGcsRequest request; - request.set_actor_id(actor_id.Binary()); - request.set_force_kill(force_kill); - request.set_no_restart(no_restart); - client_impl_->GetGcsRpcClient().KillActorViaGcs( - request, [callback](const Status &, const rpc::KillActorViaGcsReply &reply) { - if (callback) { - auto status = - reply.status().code() == (int)StatusCode::OK - ? 
Status() - : Status(StatusCode(reply.status().code()), reply.status().message()); - callback(status); - } - }); - return Status::OK(); -} - Status ServiceBasedActorInfoAccessor::AsyncCreateActor( const ray::TaskSpecification &task_spec, const ray::gcs::StatusCallback &callback) { RAY_CHECK(task_spec.IsActorCreationTask() && callback); @@ -295,7 +275,7 @@ Status ServiceBasedActorInfoAccessor::AsyncSubscribe( auto on_subscribe = [subscribe](const std::string &id, const std::string &data) { ActorTableData actor_data; actor_data.ParseFromString(data); - subscribe(ActorID::FromHex(id), actor_data); + subscribe(ActorID::FromBinary(actor_data.actor_id()), actor_data); }; return client_impl_->GetGcsPubSub().Subscribe(ACTOR_CHANNEL, actor_id.Hex(), on_subscribe, subscribe_done); @@ -727,12 +707,6 @@ Status ServiceBasedNodeResourceInfoAccessor::AsyncUpdateResources( Status ServiceBasedNodeResourceInfoAccessor::AsyncReportResourceUsage( const std::shared_ptr &data_ptr, const StatusCallback &callback) { absl::MutexLock lock(&mutex_); - last_resource_usage_->SetAvailableResources( - ResourceSet(MapFromProtobuf(data_ptr->resources_available()))); - last_resource_usage_->SetTotalResources( - ResourceSet(MapFromProtobuf(data_ptr->resources_total()))); - last_resource_usage_->SetLoadResources( - ResourceSet(MapFromProtobuf(data_ptr->resource_load()))); cached_resource_usage_.mutable_resources()->CopyFrom(*data_ptr); client_impl_->GetGcsRpcClient().ReportResourceUsage( cached_resource_usage_, @@ -1096,7 +1070,6 @@ Status ServiceBasedObjectInfoAccessor::AsyncGetAll( Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const StatusCallback &callback) { RAY_LOG(DEBUG) << "Adding object location, object id = " << object_id << ", node id = " << node_id @@ -1104,7 +1077,6 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_i rpc::AddObjectLocationRequest request; 
request.set_object_id(object_id.Binary()); request.set_node_id(node_id.Binary()); - request.set_size(object_size); auto operation = [this, request, object_id, node_id, callback](const SequencerDoneCallback &done_callback) { @@ -1128,15 +1100,13 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_i Status ServiceBasedObjectInfoAccessor::AsyncAddSpilledUrl( const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size, const StatusCallback &callback) { + const StatusCallback &callback) { RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id << ", spilled_url = " << spilled_url << ", job id = " << object_id.TaskId().JobId(); rpc::AddObjectLocationRequest request; request.set_object_id(object_id.Binary()); request.set_spilled_url(spilled_url); - request.set_spilled_node_id(spilled_node_id.Binary()); - request.set_size(object_size); auto operation = [this, request, callback](const SequencerDoneCallback &done_callback) { client_impl_->GetGcsRpcClient().AddObjectLocation( @@ -1201,14 +1171,11 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations( rpc::ObjectLocationChange update; update.set_is_add(true); update.set_node_id(loc.manager()); - update.set_size(result->size()); notification.push_back(update); } if (!result->spilled_url().empty()) { rpc::ObjectLocationChange update; update.set_spilled_url(result->spilled_url()); - update.set_spilled_node_id(result->spilled_node_id()); - update.set_size(result->size()); notification.push_back(update); } subscribe(object_id, notification); @@ -1492,26 +1459,6 @@ Status ServiceBasedPlacementGroupInfoAccessor::AsyncGet( return Status::OK(); } -Status ServiceBasedPlacementGroupInfoAccessor::AsyncGetByName( - const std::string &name, - const OptionalItemCallback &callback) { - RAY_LOG(DEBUG) << "Getting named placement group info, name = " << name; - rpc::GetNamedPlacementGroupRequest request; - request.set_name(name); 
- client_impl_->GetGcsRpcClient().GetNamedPlacementGroup( - request, [name, callback](const Status &status, - const rpc::GetNamedPlacementGroupReply &reply) { - if (reply.has_placement_group_table_data()) { - callback(status, reply.placement_group_table_data()); - } else { - callback(status, boost::none); - } - RAY_LOG(DEBUG) << "Finished getting named placement group info, status = " - << status << ", name = " << name; - }); - return Status::OK(); -} - Status ServiceBasedPlacementGroupInfoAccessor::AsyncGetAll( const MultiItemCallback &callback) { RAY_LOG(DEBUG) << "Getting all placement group info."; diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index 8aab5198f28e..b498e0acfd46 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -85,9 +85,6 @@ class ServiceBasedActorInfoAccessor : public ActorInfoAccessor { Status AsyncCreateActor(const TaskSpecification &task_spec, const StatusCallback &callback) override; - Status AsyncKillActor(const ActorID &actor_id, bool force_kill, bool no_restart, - const StatusCallback &callback) override; - Status AsyncSubscribeAll( const SubscribeCallback &subscribe, const StatusCallback &done) override; @@ -326,10 +323,9 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor { Status AsyncGetAll(const MultiItemCallback &callback) override; Status AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const StatusCallback &callback) override; + const StatusCallback &callback) override; Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &node_id, size_t object_size, const StatusCallback &callback) override; Status AsyncRemoveLocation(const ObjectID &object_id, const NodeID &node_id, @@ -456,10 +452,6 @@ class ServiceBasedPlacementGroupInfoAccessor : public PlacementGroupInfoAccessor const PlacementGroupID 
&placement_group_id, const OptionalItemCallback &callback) override; - Status AsyncGetByName( - const std::string &name, - const OptionalItemCallback &callback) override; - Status AsyncGetAll( const MultiItemCallback &callback) override; diff --git a/src/ray/gcs/gcs_client/service_based_gcs_client.cc b/src/ray/gcs/gcs_client/service_based_gcs_client.cc index 5fccd645726d..cf9bdd9e4d4e 100644 --- a/src/ray/gcs/gcs_client/service_based_gcs_client.cc +++ b/src/ray/gcs/gcs_client/service_based_gcs_client.cc @@ -207,7 +207,7 @@ void ServiceBasedGcsClient::ReconnectGcsServer() { RAY_LOG(INFO) << "Repeated reconnection in " << RayConfig::instance().minimum_gcs_reconnect_interval_milliseconds() - << " milliseconds, return directly."; + << "milliseconds, return directly."; return; } diff --git a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc b/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc index e896beccb6f5..7af602808fc7 100644 --- a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc +++ b/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc @@ -283,7 +283,7 @@ TEST_F(GlobalStateAccessorTest, TestObjectTable) { NodeID node_id = NodeID::FromRandom(); std::promise promise; RAY_CHECK_OK(gcs_client_->Objects().AsyncAddLocation( - object_id, node_id, 0, + object_id, node_id, [&promise](Status status) { promise.set_value(status.ok()); })); WaitReady(promise.get_future(), timeout_ms_); } diff --git a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc index 191ffa0fff0f..3b0f731bbccd 100644 --- a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc +++ b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc @@ -450,7 +450,7 @@ class ServiceBasedGcsClientTest : public ::testing::Test { bool AddLocation(const ObjectID &object_id, const NodeID &node_id) { std::promise promise; RAY_CHECK_OK(gcs_client_->Objects().AsyncAddLocation( - object_id, node_id, 0, + 
object_id, node_id, [&promise](Status status) { promise.set_value(status.ok()); })); return WaitReady(promise.get_future(), timeout_ms_); } @@ -715,16 +715,8 @@ TEST_F(ServiceBasedGcsClientTest, TestNodeResourceUsage) { auto resource = std::make_shared(); resource->set_node_id(node_id.Binary()); resource->set_should_global_gc(true); - std::string resource_name = "CPU"; - double resource_value = 1.0; - (*resource->mutable_resources_total())[resource_name] = resource_value; ASSERT_TRUE(ReportResourceUsage(resource)); WaitForExpectedCount(resource_batch_count, 1); - - // Get and check last report resource usage. - auto last_resource_usage = gcs_client_->NodeResources().GetLastResourceUsage(); - ASSERT_EQ(last_resource_usage->GetTotalResources().GetResource(resource_name), - resource_value); } TEST_F(ServiceBasedGcsClientTest, TestNodeResourceUsageWithLightResourceUsageReport) { diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 338fc149c327..7b30bbc7dde9 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -214,25 +214,6 @@ void GcsActorManager::HandleGetNamedActorInfo( ++counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST]; } -void GcsActorManager::HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, - rpc::KillActorViaGcsReply *reply, - rpc::SendReplyCallback send_reply_callback) { - const auto &actor_id = ActorID::FromBinary(request.actor_id()); - bool force_kill = request.force_kill(); - bool no_restart = request.no_restart(); - if (no_restart) { - DestroyActor(actor_id); - } else { - KillActor(actor_id, force_kill, no_restart); - } - - GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); - RAY_LOG(DEBUG) << "Finished killing actor, job id = " << actor_id.JobId() - << ", actor id = " << actor_id << ", force_kill = " << force_kill - << ", no_restart = " << no_restart; - ++counts_[CountType::KILL_ACTOR_REQUEST]; -} - Status 
GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &request, RegisterActorCallback success_callback) { // NOTE: After the abnormal recovery of the network between GCS client and GCS server or @@ -436,11 +417,8 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { actor_to_register_callbacks_.erase(actor_id); actor_to_create_callbacks_.erase(actor_id); auto it = registered_actors_.find(actor_id); - if (it == registered_actors_.end()) { - RAY_LOG(INFO) << "Tried to destroy actor that does not exist " << actor_id; - return; - } - const auto &task_id = it->second->GetCreationTaskSpecification().TaskId(); + RAY_CHECK(it != registered_actors_.end()) + << "Tried to destroy actor that does not exist " << actor_id; it->second->GetMutableActorTableData()->mutable_task_spec()->Clear(); it->second->GetMutableActorTableData()->set_timestamp(current_sys_time_ms()); AddDestroyedActorToCache(it->second); @@ -478,13 +456,38 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { // The actor has already been created. Destroy the process by force-killing // it. - NotifyCoreWorkerToKillActor(actor); + KillActor(actor); RAY_CHECK(node_it->second.erase(actor->GetWorkerID())); if (node_it->second.empty()) { created_actors_.erase(node_it); } } else { - CancelActorInScheduling(actor, task_id); + // The actor has not been created yet. It is either being scheduled or is + // pending scheduling. + auto canceled_actor_id = + gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); + if (!canceled_actor_id.IsNil()) { + // The actor was being scheduled and has now been canceled. + RAY_CHECK(canceled_actor_id == actor_id); + } else { + auto pending_it = + std::find_if(pending_actors_.begin(), pending_actors_.end(), + [actor_id](const std::shared_ptr &actor) { + return actor->GetActorID() == actor_id; + }); + + // The actor was pending scheduling. 
Remove it from the queue. + if (pending_it != pending_actors_.end()) { + pending_actors_.erase(pending_it); + } else { + // When actor creation request of this actor id is pending in raylet, + // it doesn't responds, and the actor should be still in leasing state. + // NOTE: Raylet will cancel the lease request once it receives the + // actor state notification. So this method doesn't have to cancel + // outstanding lease request by calling raylet_client->CancelWorkerLease + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id); + } + } } } @@ -500,9 +503,9 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor->GetActorID(), *actor_table_data, [this, actor_id, actor_table_data](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish( - ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(*actor_table_data)->SerializeAsString(), nullptr)); + RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), + actor_table_data->SerializeAsString(), + nullptr)); // Destroy placement group owned by this actor. destroy_owned_placement_group_if_needed_(actor_id); })); @@ -674,6 +677,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche // between memory cache and storage. mutable_actor_table_data->set_num_restarts(num_restarts + 1); mutable_actor_table_data->set_state(rpc::ActorTableData::RESTARTING); + const auto actor_table_data = actor->GetActorTableData(); // Make sure to reset the address before flushing to GCS. Otherwise, // GCS will mistakenly consider this lease request succeeds when restarting. actor->UpdateAddress(rpc::Address()); @@ -681,11 +685,10 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche // The backend storage is reliable in the future, so the status must be ok. 
RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, - [this, actor_id, mutable_actor_table_data](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish( - ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(*mutable_actor_table_data)->SerializeAsString(), - nullptr)); + [this, actor_id, actor_table_data](Status status) { + RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), + actor_table_data.SerializeAsString(), + nullptr)); })); gcs_actor_scheduler_->Schedule(actor); } else { @@ -698,12 +701,11 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche } mutable_actor_table_data->set_state(rpc::ActorTableData::DEAD); - mutable_actor_table_data->set_timestamp(current_sys_time_ms()); // The backend storage is reliable in the future, so the status must be ok. RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, [this, actor, actor_id, mutable_actor_table_data](Status status) { - // If actor was an detached actor, make sure to destroy it. + // if actor was an detached actor, make sure to destroy it. // We need to do this because detached actors are not destroyed // when its owners are dead because it doesn't have owners. if (actor->IsDetached()) { @@ -711,8 +713,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche } RAY_CHECK_OK(gcs_pub_sub_->Publish( ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(*mutable_actor_table_data)->SerializeAsString(), - nullptr)); + mutable_actor_table_data->SerializeAsString(), nullptr)); })); // The actor is dead, but we should not remove the entry from the // registered actors yet. 
If the actor is owned, we will destroy the actor @@ -753,9 +754,9 @@ void GcsActorManager::OnActorCreationSuccess(const std::shared_ptr &ac RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, actor_table_data, [this, actor_id, actor_table_data, actor](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish( - ACTOR_CHANNEL, actor_id.Hex(), - GenActorDataOnlyWithStates(actor_table_data)->SerializeAsString(), nullptr)); + RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), + actor_table_data.SerializeAsString(), + nullptr)); // Invoke all callbacks for all registration requests of this actor (duplicated // requests are included) and remove all of them from // actor_to_create_callbacks_. @@ -931,47 +932,15 @@ void GcsActorManager::RemoveActorFromOwner(const std::shared_ptr &acto } } -void GcsActorManager::NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, - bool force_kill, bool no_restart) { +void GcsActorManager::KillActor(const std::shared_ptr &actor) { auto actor_client = worker_client_factory_(actor->GetAddress()); rpc::KillActorRequest request; request.set_intended_actor_id(actor->GetActorID().Binary()); - request.set_force_kill(force_kill); - request.set_no_restart(no_restart); + request.set_force_kill(true); + request.set_no_restart(true); RAY_UNUSED(actor_client->KillActor(request, nullptr)); } -void GcsActorManager::KillActor(const ActorID &actor_id, bool force_kill, - bool no_restart) { - RAY_LOG(DEBUG) << "Killing actor, job id = " << actor_id.JobId() - << ", actor id = " << actor_id << ", force_kill = " << force_kill; - const auto &it = registered_actors_.find(actor_id); - if (it == registered_actors_.end()) { - RAY_LOG(INFO) << "Tried to kill actor that does not exist " << actor_id; - return; - } - - const auto &actor = it->second; - if (actor->GetState() == rpc::ActorTableData::DEAD || - actor->GetState() == rpc::ActorTableData::DEPENDENCIES_UNREADY) { - return; - } - - // The actor is still alive or pending creation. 
- const auto &node_id = actor->GetNodeID(); - const auto &worker_id = actor->GetWorkerID(); - auto node_it = created_actors_.find(node_id); - if (node_it != created_actors_.end() && node_it->second.count(worker_id)) { - // The actor has already been created. Destroy the process by force-killing - // it. - NotifyCoreWorkerToKillActor(actor, force_kill, no_restart); - } else { - const auto &task_id = actor->GetCreationTaskSpecification().TaskId(); - CancelActorInScheduling(actor, task_id); - ReconstructActor(actor_id, /*need_reschedule=*/true); - } -} - void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr &actor) { if (destroyed_actors_.size() >= RayConfig::instance().maximum_gcs_destroyed_actor_cached_count()) { @@ -985,36 +954,6 @@ void GcsActorManager::AddDestroyedActorToCache(const std::shared_ptr & actor->GetActorID(), (int64_t)actor->GetActorTableData().timestamp()); } -void GcsActorManager::CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id) { - const auto &actor_id = actor->GetActorID(); - const auto &node_id = actor->GetNodeID(); - // The actor has not been created yet. It is either being scheduled or is - // pending scheduling. - auto canceled_actor_id = - gcs_actor_scheduler_->CancelOnWorker(actor->GetNodeID(), actor->GetWorkerID()); - if (!canceled_actor_id.IsNil()) { - // The actor was being scheduled and has now been canceled. - RAY_CHECK(canceled_actor_id == actor_id); - } else { - auto pending_it = std::find_if(pending_actors_.begin(), pending_actors_.end(), - [actor_id](const std::shared_ptr &actor) { - return actor->GetActorID() == actor_id; - }); - - // The actor was pending scheduling. Remove it from the queue. - if (pending_it != pending_actors_.end()) { - pending_actors_.erase(pending_it); - } else { - // When actor creation request of this actor id is pending in raylet, - // it doesn't responds, and the actor should be still in leasing state. 
- // NOTE: We will cancel outstanding lease request by calling - // `raylet_client->CancelWorkerLease`. - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor_id, task_id); - } - } -} - std::string GcsActorManager::DebugString() const { std::ostringstream stream; stream << "GcsActorManager: {RegisterActor request count: " @@ -1023,7 +962,6 @@ std::string GcsActorManager::DebugString() const { << ", GetActorInfo request count: " << counts_[CountType::GET_ACTOR_INFO_REQUEST] << ", GetNamedActorInfo request count: " << counts_[CountType::GET_NAMED_ACTOR_INFO_REQUEST] - << ", KillActor request count: " << counts_[CountType::KILL_ACTOR_REQUEST] << ", Registered actors count: " << registered_actors_.size() << ", Destroyed actors count: " << destroyed_actors_.size() << ", Named actors count: " << named_actors_.size() diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index f2db9345f0ba..0f47cfb4f672 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -190,10 +190,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { rpc::GetAllActorInfoReply *reply, rpc::SendReplyCallback send_reply_callback) override; - void HandleKillActorViaGcs(const rpc::KillActorViaGcsRequest &request, - rpc::KillActorViaGcsReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - /// Register actor asynchronously. /// /// \param request Contains the meta info to create the actor. @@ -320,6 +316,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { absl::flat_hash_set GetUnresolvedActorsByOwnerWorker( const NodeID &node_id, const WorkerID &worker_id) const; + private: /// Reconstruct the specified actor. /// /// \param actor The target actor to be reconstructed. @@ -340,18 +337,8 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// Kill the specified actor. /// - /// \param actor_id ID of the actor to kill. 
- /// \param force_kill Whether to force kill an actor by killing the worker. - /// \param no_restart If set to true, the killed actor will not be restarted anymore. - void KillActor(const ActorID &actor_id, bool force_kill, bool no_restart); - - /// Notify CoreWorker to kill the specified actor. - /// /// \param actor The actor to be killed. - /// \param force_kill Whether to force kill an actor by killing the worker. - /// \param no_restart If set to true, the killed actor will not be restarted anymore. - void NotifyCoreWorkerToKillActor(const std::shared_ptr &actor, - bool force_kill = true, bool no_restart = true); + void KillActor(const std::shared_ptr &actor); /// Add the destroyed actor to the cache. If the cache is full, one actor is randomly /// evicted. @@ -359,24 +346,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// \param actor The actor to be killed. void AddDestroyedActorToCache(const std::shared_ptr &actor); - std::shared_ptr GenActorDataOnlyWithStates( - const rpc::ActorTableData &actor) { - auto actor_delta = std::make_shared(); - actor_delta->set_state(actor.state()); - actor_delta->mutable_address()->CopyFrom(actor.address()); - actor_delta->set_num_restarts(actor.num_restarts()); - actor_delta->set_timestamp(actor.timestamp()); - actor_delta->set_pid(actor.pid()); - return actor_delta; - } - - /// Cancel actor which is either being scheduled or is pending scheduling. - /// - /// \param actor The actor to be cancelled. - /// \param task_id The id of actor creation task to be cancelled. - void CancelActorInScheduling(const std::shared_ptr &actor, - const TaskID &task_id); - /// Callbacks of pending `RegisterActor` requests. /// Maps actor ID to actor registration callbacks, which is used to filter duplicated /// messages from a driver/worker caused by some network problems. 
@@ -434,8 +403,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { GET_ACTOR_INFO_REQUEST = 2, GET_NAMED_ACTOR_INFO_REQUEST = 3, GET_ALL_ACTOR_INFO_REQUEST = 4, - KILL_ACTOR_REQUEST = 5, - CountType_MAX = 6, + CountType_MAX = 10, }; uint64_t counts_[CountType::CountType_MAX] = {0}; }; diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc index 1b4201c4f573..9c81c8c0e98d 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc @@ -127,27 +127,13 @@ std::vector GcsActorScheduler::CancelOnNode(const NodeID &node_id) { return actor_ids; } -void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) { - // NOTE: This method will cancel the outstanding lease request and remove leasing - // information from the internal state. +void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) { + // NOTE: This method does not currently cancel the outstanding lease request. + // It only removes leasing information from the internal state so that + // RequestWorkerLease ignores the response from raylet. 
auto node_it = node_to_actors_when_leasing_.find(node_id); - if (node_it != node_to_actors_when_leasing_.end()) { - node_it->second.erase(actor_id); - } - - const auto &alive_nodes = gcs_node_manager_.GetAllAliveNodes(); - const auto &iter = alive_nodes.find(node_id); - if (iter != alive_nodes.end()) { - const auto &node_info = iter->second; - rpc::Address address; - address.set_raylet_id(node_info->node_id()); - address.set_ip_address(node_info->node_manager_address()); - address.set_port(node_info->node_manager_port()); - auto lease_client = GetOrConnectLeaseClient(address); - lease_client->CancelWorkerLease( - task_id, [](const Status &status, const rpc::CancelWorkerLeaseReply &reply) {}); - } + RAY_CHECK(node_it != node_to_actors_when_leasing_.end()); + node_it->second.erase(actor_id); } ActorID GcsActorScheduler::CancelOnWorker(const NodeID &node_id, @@ -252,16 +238,6 @@ void GcsActorScheduler::LeaseWorkerFromNode(std::shared_ptr actor, } if (status.ok()) { - if (reply.worker_address().raylet_id().empty() && - reply.retry_at_raylet_address().raylet_id().empty()) { - // Actor creation task has been cancelled. It is triggered by `ray.kill`. If - // the number of remaining restarts of the actor is not equal to 0, GCS will - // reschedule the actor, so it return directly here. - RAY_LOG(DEBUG) << "Actor " << actor->GetActorID() - << " creation task has been cancelled."; - return; - } - // Remove the actor from the leasing map as the reply is returned from the // remote node. iter->second.erase(actor_iter); diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h index c0e3d430ecbf..71dd351087e0 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h @@ -59,8 +59,7 @@ class GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. 
- virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) = 0; + virtual void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) = 0; /// Cancel the actor that is being scheduled to the specified worker. /// @@ -131,8 +130,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// /// \param node_id ID of the node where the actor leasing request has been sent. /// \param actor_id ID of an actor. - void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id) override; + void CancelOnLeasing(const NodeID &node_id, const ActorID &actor_id) override; /// Cancel the actor that is being scheduled to the specified worker. /// diff --git a/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc b/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc index 5991c20a8f0e..b6dd56945cbf 100644 --- a/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc @@ -103,7 +103,7 @@ void GcsHeartbeatManager::DetectDeadNodes() { void GcsHeartbeatManager::ScheduleTick() { auto heartbeat_period = boost::posix_time::milliseconds( - RayConfig::instance().raylet_heartbeat_period_milliseconds()); + RayConfig::instance().raylet_heartbeat_timeout_milliseconds()); detect_timer_.expires_from_now(heartbeat_period); detect_timer_.async_wait([this](const boost::system::error_code &error) { if (error == boost::asio::error::operation_aborted) { diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.cc b/src/ray/gcs/gcs_server/gcs_object_manager.cc index 818904d65b61..b5cc8f765113 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_object_manager.cc @@ -51,7 +51,6 @@ void GcsObjectManager::HandleGetAllObjectLocations( object_table_data.set_manager(node_id.Binary()); object_location_info.add_locations()->CopyFrom(object_table_data); } - object_location_info.set_size(item.second.object_size); 
reply->add_object_location_info_list()->CopyFrom(object_location_info); } RAY_LOG(DEBUG) << "Finished getting all object locations."; @@ -66,7 +65,6 @@ void GcsObjectManager::HandleAddObjectLocation( NodeID node_id; std::string spilled_url; - NodeID spilled_node_id; if (!request.node_id().empty()) { node_id = NodeID::FromBinary(request.node_id()); RAY_LOG(DEBUG) << "Adding object location, job id = " << object_id.TaskId().JobId() @@ -76,14 +74,11 @@ void GcsObjectManager::HandleAddObjectLocation( absl::MutexLock lock(&mutex_); RAY_CHECK(!request.spilled_url().empty()); spilled_url = request.spilled_url(); - spilled_node_id = NodeID::FromBinary(request.spilled_node_id()); object_to_locations_[object_id].spilled_url = spilled_url; - object_to_locations_[object_id].spilled_node_id = spilled_node_id; RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id; } - size_t size = request.size(); - auto on_done = [this, object_id, node_id, spilled_url, size, spilled_node_id, reply, + auto on_done = [this, object_id, node_id, spilled_url, reply, send_reply_callback](const Status &status) { if (status.ok()) { rpc::ObjectLocationChange notification; @@ -93,16 +88,13 @@ void GcsObjectManager::HandleAddObjectLocation( } if (!spilled_url.empty()) { notification.set_spilled_url(spilled_url); - notification.set_spilled_node_id(spilled_node_id.Binary()); } - notification.set_size(size); RAY_CHECK_OK(gcs_pub_sub_->Publish(OBJECT_CHANNEL, object_id.Hex(), notification.SerializeAsString(), nullptr)); RAY_LOG(DEBUG) << "Finished adding object location, job id = " << object_id.TaskId().JobId() << ", object id = " << object_id << ", node id = " << node_id << ", task id = " << object_id.TaskId() - << ", spilled_url = " << spilled_url - << ", spilled_node_id = " << spilled_node_id; + << ", spilled_url = " << spilled_url; } else { RAY_LOG(ERROR) << "Failed to add object location: " << status.ToString() << ", job id = " << object_id.TaskId().JobId() @@ -115,7 +107,6 @@ 
void GcsObjectManager::HandleAddObjectLocation( }; absl::MutexLock lock(&mutex_); - object_to_locations_[object_id].object_size = size; const auto object_data = GenObjectLocationInfo(object_id); Status status = gcs_table_storage_->ObjectTable().Put(object_id, object_data, on_done); if (!status.ok()) { @@ -296,8 +287,6 @@ const ObjectLocationInfo GcsObjectManager::GenObjectLocationInfo( object_data.add_locations()->set_manager(node_id.Binary()); } object_data.set_spilled_url(it->second.spilled_url); - object_data.set_spilled_node_id(it->second.spilled_node_id.Binary()); - object_data.set_size(it->second.object_size); } return object_data; } diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.h b/src/ray/gcs/gcs_server/gcs_object_manager.h index 6d4d39598cb6..bd21bfd1b977 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.h +++ b/src/ray/gcs/gcs_server/gcs_object_manager.h @@ -65,8 +65,6 @@ class GcsObjectManager : public rpc::ObjectInfoHandler { struct LocationSet { absl::flat_hash_set locations; std::string spilled_url = ""; - NodeID spilled_node_id = NodeID::Nil(); - size_t object_size = 0; }; /// Add a location of objects. 
diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc index 12260d867d37..b56f6b1d3b81 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc @@ -65,8 +65,7 @@ rpc::PlacementStrategy GcsPlacementGroup::GetStrategy() const { return placement_group_table_data_.strategy(); } -const rpc::PlacementGroupTableData &GcsPlacementGroup::GetPlacementGroupTableData() - const { +const rpc::PlacementGroupTableData &GcsPlacementGroup::GetPlacementGroupTableData() { return placement_group_table_data_; } @@ -97,15 +96,11 @@ void GcsPlacementGroup::MarkCreatorActorDead() { placement_group_table_data_.set_creator_actor_dead(true); } -bool GcsPlacementGroup::IsPlacementGroupLifetimeDone() const { - return !IsDetached() && placement_group_table_data_.creator_job_dead() && +bool GcsPlacementGroup::IsPlacementGroupRemovable() const { + return placement_group_table_data_.creator_job_dead() && placement_group_table_data_.creator_actor_dead(); } -bool GcsPlacementGroup::IsDetached() const { - return placement_group_table_data_.is_detached(); -} - ///////////////////////////////////////////////////////////////////////////////////////// GcsPlacementGroupManager::GcsPlacementGroupManager( @@ -148,21 +143,6 @@ void GcsPlacementGroupManager::RegisterPlacementGroup( } return; } - if (!placement_group->GetName().empty()) { - auto it = named_placement_groups_.find(placement_group->GetName()); - if (it == named_placement_groups_.end()) { - named_placement_groups_.emplace(placement_group->GetName(), - placement_group->GetPlacementGroupID()); - } else { - std::stringstream stream; - stream << "Failed to create placement group '" - << placement_group->GetPlacementGroupID() << "' because name '" - << placement_group->GetName() << "' already exists."; - RAY_LOG(WARNING) << stream.str(); - callback(Status::Invalid(stream.str())); - return; - } - } // Mark the 
callback as pending and invoke it after the placement_group has been // successfully created. @@ -194,9 +174,11 @@ void GcsPlacementGroupManager::RegisterPlacementGroup( PlacementGroupID GcsPlacementGroupManager::GetPlacementGroupIDByName( const std::string &name) { PlacementGroupID placement_group_id = PlacementGroupID::Nil(); - auto it = named_placement_groups_.find(name); - if (it != named_placement_groups_.end()) { - placement_group_id = it->second; + for (const auto &iter : registered_placement_groups_) { + if (iter.second->GetName() == name) { + placement_group_id = iter.first; + break; + } } return placement_group_id; } @@ -329,19 +311,10 @@ void GcsPlacementGroupManager::RemovePlacementGroup( on_placement_group_removed(Status::OK()); return; } - auto placement_group = std::move(placement_group_it->second); + auto placement_group = placement_group_it->second; registered_placement_groups_.erase(placement_group_it); placement_group_to_create_callbacks_.erase(placement_group_id); - // Remove placement group from `named_placement_groups_` if its name is not empty. - if (!placement_group->GetName().empty()) { - auto it = named_placement_groups_.find(placement_group->GetName()); - if (it != named_placement_groups_.end() && - it->second == placement_group->GetPlacementGroupID()) { - named_placement_groups_.erase(it); - } - } - // Destroy all bundles. 
gcs_placement_group_scheduler_->DestroyPlacementGroupBundleResourcesIfExists( placement_group_id); @@ -408,30 +381,6 @@ void GcsPlacementGroupManager::HandleGetPlacementGroup( ++counts_[CountType::GET_PLACEMENT_GROUP_REQUEST]; } -void GcsPlacementGroupManager::HandleGetNamedPlacementGroup( - const rpc::GetNamedPlacementGroupRequest &request, - rpc::GetNamedPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) { - const std::string &name = request.name(); - RAY_LOG(DEBUG) << "Getting named placement group info, name = " << name; - - // Try to look up the placement Group ID for the named placement group. - auto placement_group_id = GetPlacementGroupIDByName(name); - - if (placement_group_id.IsNil()) { - // The placement group was not found. - RAY_LOG(DEBUG) << "Placement Group with name '" << name << "' was not found"; - } else { - const auto &iter = registered_placement_groups_.find(placement_group_id); - RAY_CHECK(iter != registered_placement_groups_.end()); - reply->mutable_placement_group_table_data()->CopyFrom( - iter->second->GetPlacementGroupTableData()); - RAY_LOG(DEBUG) << "Finished get named placement group info, placement group id = " - << placement_group_id; - } - GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK()); - ++counts_[CountType::GET_NAMED_PLACEMENT_GROUP_REQUEST]; -} - void GcsPlacementGroupManager::HandleGetAllPlacementGroup( const rpc::GetAllPlacementGroupRequest &request, rpc::GetAllPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) { @@ -546,7 +495,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenJobDead( continue; } placement_group->MarkCreatorJobDead(); - if (placement_group->IsPlacementGroupLifetimeDone()) { + if (placement_group->IsPlacementGroupRemovable()) { RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {}); } } @@ -560,7 +509,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenActorDead( continue; } 
placement_group->MarkCreatorActorDead(); - if (placement_group->IsPlacementGroupLifetimeDone()) { + if (placement_group->IsPlacementGroupRemovable()) { RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {}); } } @@ -597,10 +546,6 @@ void GcsPlacementGroupManager::Initialize(const GcsInitData &gcs_init_data) { auto placement_group = std::make_shared(item.second); if (item.second.state() != rpc::PlacementGroupTableData::REMOVED) { registered_placement_groups_.emplace(item.first, placement_group); - if (!placement_group->GetName().empty()) { - named_placement_groups_.emplace(placement_group->GetName(), - placement_group->GetPlacementGroupID()); - } if (item.second.state() == rpc::PlacementGroupTableData::PENDING || item.second.state() == rpc::PlacementGroupTableData::RESCHEDULING) { @@ -638,7 +583,6 @@ std::string GcsPlacementGroupManager::DebugString() const { << ", WaitPlacementGroupUntilReady request count: " << counts_[CountType::WAIT_PLACEMENT_GROUP_UNTIL_READY_REQUEST] << ", Registered placement groups count: " << registered_placement_groups_.size() - << ", Named placement group count: " << named_placement_groups_.size() << ", Pending placement groups count: " << pending_placement_groups_.size() << "}"; return stream.str(); diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index 49a7634dfc0f..8bd36941745f 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -61,11 +61,10 @@ class GcsPlacementGroup { placement_group_spec.creator_job_dead()); placement_group_table_data_.set_creator_actor_dead( placement_group_spec.creator_actor_dead()); - placement_group_table_data_.set_is_detached(placement_group_spec.is_detached()); } /// Get the immutable PlacementGroupTableData of this placement group. 
- const rpc::PlacementGroupTableData &GetPlacementGroupTableData() const; + const rpc::PlacementGroupTableData &GetPlacementGroupTableData(); /// Get the mutable bundle of this placement group. rpc::Bundle *GetMutableBundle(int bundle_index); @@ -108,11 +107,8 @@ class GcsPlacementGroup { /// Mark that the creator actor of this placement group is dead. void MarkCreatorActorDead(); - /// Return True if the placement group lifetime is done. False otherwise. - bool IsPlacementGroupLifetimeDone() const; - - /// Returns whether or not this is a detached placement group. - bool IsDetached() const; + /// Return True if the placement group is removable. False otherwise. + bool IsPlacementGroupRemovable() const; private: /// The placement_group meta data which contains the task specification as well as the @@ -155,13 +151,10 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { rpc::GetPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) override; - void HandleGetNamedPlacementGroup(const rpc::GetNamedPlacementGroupRequest &request, - rpc::GetNamedPlacementGroupReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - void HandleGetAllPlacementGroup(const rpc::GetAllPlacementGroupRequest &request, rpc::GetAllPlacementGroupReply *reply, rpc::SendReplyCallback send_reply_callback) override; + void HandleWaitPlacementGroupUntilReady( const rpc::WaitPlacementGroupUntilReadyRequest &request, rpc::WaitPlacementGroupUntilReadyReply *reply, @@ -200,7 +193,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { void OnPlacementGroupCreationSuccess( const std::shared_ptr &placement_group); - /// Remove the placement group of a given id. + /// TODO-SANG Fill it up. void RemovePlacementGroup(const PlacementGroupID &placement_group_id, StatusCallback on_placement_group_removed); @@ -318,9 +311,6 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { /// Reference of GcsResourceManager. 
GcsResourceManager &gcs_resource_manager_; - /// Maps placement group names to their placement group ID for lookups by name. - absl::flat_hash_map named_placement_groups_; - // Debug info. enum CountType { CREATE_PLACEMENT_GROUP_REQUEST = 0, @@ -328,8 +318,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { GET_PLACEMENT_GROUP_REQUEST = 2, GET_ALL_PLACEMENT_GROUP_REQUEST = 3, WAIT_PLACEMENT_GROUP_UNTIL_READY_REQUEST = 4, - GET_NAMED_PLACEMENT_GROUP_REQUEST = 5, - CountType_MAX = 6, + CountType_MAX = 5, }; uint64_t counts_[CountType::CountType_MAX] = {0}; }; diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc index b8edb6e82164..b88c6702bfeb 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc @@ -35,8 +35,7 @@ class MockActorScheduler : public gcs::GcsActorSchedulerInterface { MOCK_METHOD1(CancelOnNode, std::vector(const NodeID &node_id)); MOCK_METHOD2(CancelOnWorker, ActorID(const NodeID &node_id, const WorkerID &worker_id)); - MOCK_METHOD3(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id, - const TaskID &task_id)); + MOCK_METHOD2(CancelOnLeasing, void(const NodeID &node_id, const ActorID &actor_id)); std::vector> actors; }; @@ -736,10 +735,8 @@ TEST_F(GcsActorManagerTest, TestRaceConditionCancelLease) { address.set_raylet_id(node_id.Binary()); address.set_worker_id(worker_id.Binary()); actor->UpdateAddress(address); - const auto &actor_id = actor->GetActorID(); - const auto &task_id = - TaskID::FromBinary(registered_actor->GetActorTableData().task_spec().task_id()); - EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id, task_id)); + const auto actor_id = actor->GetActorID(); + EXPECT_CALL(*mock_actor_scheduler_, CancelOnLeasing(node_id, actor_id)); gcs_actor_manager_->OnWorkerDead(owner_node_id, owner_worker_id); } diff --git 
a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index bd98d65ef0f9..d84f99b3fe88 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -262,8 +262,7 @@ TEST_F(GcsActorSchedulerTest, TestLeasingCancelledWhenLeasing) { ASSERT_EQ(1, raylet_client_->callbacks.size()); // Cancel the lease request. - const auto &task_id = TaskID::FromBinary(create_actor_request.task_spec().task_id()); - gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID(), task_id); + gcs_actor_scheduler_->CancelOnLeasing(node_id, actor->GetActorID()); ASSERT_EQ(1, raylet_client_->num_workers_requested); ASSERT_EQ(1, raylet_client_->callbacks.size()); diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc index 77784e44b9e4..fec3f2540401 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc @@ -174,31 +174,6 @@ TEST_F(GcsPlacementGroupManagerTest, TestGetPlacementGroupIDByName) { PlacementGroupID::FromBinary(request.placement_group_spec().placement_group_id())); } -TEST_F(GcsPlacementGroupManagerTest, TestRemoveNamedPlacementGroup) { - auto request = Mocker::GenCreatePlacementGroupRequest("test_name"); - std::atomic finished_placement_group_count(0); - gcs_placement_group_manager_->RegisterPlacementGroup( - std::make_shared(request), - [&finished_placement_group_count](const Status &status) { - ++finished_placement_group_count; - }); - - ASSERT_EQ(finished_placement_group_count, 0); - WaitForExpectedPgCount(1); - auto placement_group = mock_placement_group_scheduler_->placement_groups_.back(); - mock_placement_group_scheduler_->placement_groups_.pop_back(); - - gcs_placement_group_manager_->OnPlacementGroupCreationSuccess(placement_group); - 
WaitForExpectedCount(finished_placement_group_count, 1); - ASSERT_EQ(placement_group->GetState(), rpc::PlacementGroupTableData::CREATED); - // Remove the named placement group. - gcs_placement_group_manager_->RemovePlacementGroup( - placement_group->GetPlacementGroupID(), - [](const Status &status) { ASSERT_TRUE(status.ok()); }); - ASSERT_EQ(gcs_placement_group_manager_->GetPlacementGroupIDByName("test_name"), - PlacementGroupID::Nil()); -} - TEST_F(GcsPlacementGroupManagerTest, TestRescheduleWhenNodeAdd) { auto request = Mocker::GenCreatePlacementGroupRequest(); std::atomic finished_placement_group_count(0); diff --git a/src/ray/gcs/pubsub/gcs_pub_sub.h b/src/ray/gcs/pubsub/gcs_pub_sub.h index b871a02b13dd..e5b3c1509265 100644 --- a/src/ray/gcs/pubsub/gcs_pub_sub.h +++ b/src/ray/gcs/pubsub/gcs_pub_sub.h @@ -45,7 +45,7 @@ class GcsPubSub { using Callback = std::function; explicit GcsPubSub(std::shared_ptr redis_client) - : redis_client_(redis_client), total_commands_queued_(0) {} + : redis_client_(redis_client) {} virtual ~GcsPubSub() = default; diff --git a/src/ray/gcs/store_client/redis_store_client.cc b/src/ray/gcs/store_client/redis_store_client.cc index 0216b92a6942..b104be3adbf4 100644 --- a/src/ray/gcs/store_client/redis_store_client.cc +++ b/src/ray/gcs/store_client/redis_store_client.cc @@ -104,8 +104,7 @@ Status RedisStoreClient::AsyncDelete(const std::string &table_name, } std::string redis_key = GenRedisKey(table_name, key); - // We always replace `DEL` with `UNLINK`. 
- std::vector args = {"UNLINK", redis_key}; + std::vector args = {"DEL", redis_key}; auto shard_context = redis_client_->GetShardContext(redis_key); return shard_context->RunArgvAsync(args, delete_callback); @@ -116,7 +115,7 @@ Status RedisStoreClient::AsyncDeleteWithIndex(const std::string &table_name, const std::string &index_key, const StatusCallback &callback) { std::vector redis_keys; - redis_keys.reserve(2); + redis_keys.reserve(20); redis_keys.push_back(GenRedisKey(table_name, key)); redis_keys.push_back(GenRedisKey(table_name, key, index_key)); @@ -219,11 +218,10 @@ Status RedisStoreClient::DoPut(const std::string &key, const std::string &data, Status RedisStoreClient::DeleteByKeys(const std::vector &keys, const StatusCallback &callback) { - // Delete for each shard. - // We always replace `DEL` with `UNLINK`. + // The `DEL` command for each shard. int total_count = 0; auto del_commands_by_shards = - GenCommandsByShards(redis_client_, "UNLINK", keys, &total_count); + GenCommandsByShards(redis_client_, "DEL", keys, &total_count); auto finished_count = std::make_shared(0); diff --git a/src/ray/gcs/test/gcs_test_util.h b/src/ray/gcs/test/gcs_test_util.h index 4d51fdd866f6..bf908c3a278f 100644 --- a/src/ray/gcs/test/gcs_test_util.h +++ b/src/ray/gcs/test/gcs_test_util.h @@ -101,9 +101,8 @@ struct Mocker { PlacementGroupSpecBuilder builder; auto placement_group_id = PlacementGroupID::FromRandom(); - builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy, - /* is_detached */ false, job_id, actor_id, - /* is_creator_detached */ false); + builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy, job_id, + actor_id, /* is_creator_detached */ false); return builder.Build(); } diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 3cda75266ad0..9c71e2c2b5e8 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -17,8 +17,7 @@ using SpillObjectsCallback = std::function; 
using SpaceReleasedCallback = std::function; /// A callback to call when a spilled object needs to be returned to the object store. -using RestoreSpilledObjectCallback = - std::function)>; +using RestoreSpilledObjectCallback = std::function)>; } // namespace ray diff --git a/src/ray/object_manager/object_buffer_pool.cc b/src/ray/object_manager/object_buffer_pool.cc index 63dabcb419ef..4b6a44e6b5fd 100644 --- a/src/ray/object_manager/object_buffer_pool.cc +++ b/src/ray/object_manager/object_buffer_pool.cc @@ -57,13 +57,9 @@ std::pair ObjectBufferPool::Ge std::lock_guard lock(pool_mutex_); if (get_buffer_state_.count(object_id) == 0) { plasma::ObjectBuffer object_buffer; - RAY_CHECK_OK( - store_client_.Get(&object_id, 1, 0, &object_buffer, /*is_from_worker=*/false)); + RAY_CHECK_OK(store_client_.Get(&object_id, 1, 0, &object_buffer)); if (object_buffer.data == nullptr) { - RAY_LOG(INFO) - << "Failed to get a chunk of the object: " << object_id - << ". It is mostly because the object is already evicted or spilled when the " - "pull request is received. The caller will retry the pull request again."; + RAY_LOG(ERROR) << "Failed to get object"; return std::pair( errored_chunk_, ray::Status::IOError("Unable to obtain object chunk, object not local.")); diff --git a/src/ray/object_manager/object_directory.cc b/src/ray/object_manager/object_directory.cc index 27e6f42b0bd6..189cc0dd7d4b 100644 --- a/src/ray/object_manager/object_directory.cc +++ b/src/ray/object_manager/object_directory.cc @@ -31,21 +31,13 @@ using ray::rpc::ObjectTableData; /// object table entries up to but not including this notification. bool UpdateObjectLocations(const std::vector &location_updates, std::shared_ptr gcs_client, - std::unordered_set *node_ids, std::string *spilled_url, - NodeID *spilled_node_id, size_t *object_size) { + std::unordered_set *node_ids, + std::string *spilled_url) { // location_updates contains the updates of locations of the object. 
// with GcsChangeMode, we can determine whether the update mode is // addition or deletion. bool isUpdated = false; for (const auto &update : location_updates) { - // The size can be 0 if the update was a deletion. This assumes that an - // object's size is always greater than 0. - // TODO(swang): If that's not the case, we should use a flag to check - // whether the size is set instead. - if (update.size() > 0) { - *object_size = update.size(); - } - if (!update.node_id().empty()) { NodeID node_id = NodeID::FromBinary(update.node_id()); if (update.is_add() && 0 == node_ids->count(node_id)) { @@ -57,12 +49,9 @@ bool UpdateObjectLocations(const std::vector &locatio } } else { RAY_CHECK(!update.spilled_url().empty()); - const auto received_spilled_node_id = NodeID::FromBinary(update.spilled_node_id()); - RAY_LOG(DEBUG) << "Received object spilled at " << update.spilled_url() - << " spilled at " << NodeID::FromBinary(update.spilled_node_id()); + RAY_LOG(DEBUG) << "Received object spilled at " << update.spilled_url(); if (update.spilled_url() != *spilled_url) { *spilled_url = update.spilled_url(); - *spilled_node_id = received_spilled_node_id; isUpdated = true; } } @@ -84,10 +73,9 @@ bool UpdateObjectLocations(const std::vector &locatio ray::Status ObjectDirectory::ReportObjectAdded( const ObjectID &object_id, const NodeID &node_id, const object_manager::protocol::ObjectInfoT &object_info) { - size_t size = object_info.data_size + object_info.metadata_size; - RAY_LOG(DEBUG) << "Reporting object added to GCS " << object_id << " size " << size; + RAY_LOG(DEBUG) << "Reporting object added to GCS " << object_id; ray::Status status = - gcs_client_->Objects().AsyncAddLocation(object_id, node_id, size, nullptr); + gcs_client_->Objects().AsyncAddLocation(object_id, node_id, nullptr); return status; } @@ -131,17 +119,14 @@ void ObjectDirectory::HandleNodeRemoved(const NodeID &node_id) { // If the subscribed object has the removed node as a location, update // its locations with 
an empty update so that the location will be removed. UpdateObjectLocations({}, gcs_client_, &listener.second.current_object_locations, - &listener.second.spilled_url, - &listener.second.spilled_node_id, - &listener.second.object_size); + &listener.second.spilled_url); // Re-call all the subscribed callbacks for the object, since its // locations have changed. for (const auto &callback_pair : listener.second.callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, listener.second.current_object_locations, - listener.second.spilled_url, listener.second.spilled_node_id, - listener.second.object_size); + listener.second.spilled_url); } } } @@ -168,11 +153,11 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // Once this flag is set to true, it should never go back to false. it->second.subscribed = true; + // Update entries for this object. if (!UpdateObjectLocations(object_notifications, gcs_client_, &it->second.current_object_locations, - &it->second.spilled_url, &it->second.spilled_node_id, - &it->second.object_size)) { + &it->second.spilled_url)) { return; } // Copy the callbacks so that the callbacks can unsubscribe without interrupting @@ -186,8 +171,7 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // It is safe to call the callback directly since this is already running // in the subscription callback stack. 
callback_pair.second(object_id, it->second.current_object_locations, - it->second.spilled_url, it->second.spilled_node_id, - it->second.object_size); + it->second.spilled_url); } }; status = gcs_client_->Objects().AsyncSubscribeToLocations( @@ -205,12 +189,9 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i if (listener_state.subscribed) { auto &locations = listener_state.current_object_locations; auto &spilled_url = listener_state.spilled_url; - auto &spilled_node_id = listener_state.spilled_node_id; - auto object_size = it->second.object_size; - io_service_.post( - [callback, locations, spilled_url, object_size, object_id, spilled_node_id]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); + io_service_.post([callback, locations, spilled_url, object_id]() { + callback(object_id, locations, spilled_url); + }); } return status; } @@ -242,12 +223,9 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, // cached locations. 
auto &locations = it->second.current_object_locations; auto &spilled_url = it->second.spilled_url; - auto &spilled_node_id = it->second.spilled_node_id; - auto object_size = it->second.object_size; - io_service_.post( - [callback, object_id, spilled_url, locations, object_size, spilled_node_id]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); + io_service_.post([callback, object_id, spilled_url, locations]() { + callback(object_id, locations, spilled_url); + }); } else { // We do not have any locations cached due to a concurrent // SubscribeObjectLocations call, so look up the object's locations @@ -269,19 +247,15 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, if (!update->spilled_url().empty()) { rpc::ObjectLocationChange change; change.set_spilled_url(update->spilled_url()); - change.set_spilled_node_id(update->spilled_node_id()); notification.push_back(change); } std::unordered_set node_ids; std::string spilled_url; - NodeID spilled_node_id; - size_t object_size = 0; - UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url, - &spilled_node_id, &object_size); + UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url); // It is safe to call the callback directly since this is already running // in the GCS client's lookup callback stack. - callback(object_id, node_ids, spilled_url, spilled_node_id, object_size); + callback(object_id, node_ids, spilled_url); }); } return status; diff --git a/src/ray/object_manager/object_directory.h b/src/ray/object_manager/object_directory.h index 0a4c6300a81a..3ce15882bfea 100644 --- a/src/ray/object_manager/object_directory.h +++ b/src/ray/object_manager/object_directory.h @@ -41,9 +41,9 @@ struct RemoteConnectionInfo { }; /// Callback for object location notifications. 
-using OnLocationsFound = std::function &, - const std::string &, const NodeID &, size_t object_size)>; +using OnLocationsFound = + std::function &, const std::string &)>; class ObjectDirectoryInterface { public: @@ -185,11 +185,6 @@ class ObjectDirectory : public ObjectDirectoryInterface { std::unordered_set current_object_locations; /// The location where this object has been spilled, if any. std::string spilled_url = ""; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. - NodeID spilled_node_id = NodeID::Nil(); - /// The size of the object. - size_t object_size = 0; /// This flag will get set to true if received any notification of the object. /// It means current_object_locations is up-to-date with GCS. It /// should never go back to false once set to true. If this is true, and diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index d59737ca6c25..d82a5fb0d069 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -73,6 +73,18 @@ ObjectManager::ObjectManager(asio::io_service &main_service, const NodeID &self_ boost::posix_time::milliseconds(config.timer_freq_ms)) { RAY_CHECK(config_.rpc_service_threads_number > 0); + const auto &object_is_local = [this](const ObjectID &object_id) { + return local_objects_.count(object_id) != 0; + }; + const auto &send_pull_request = [this](const ObjectID &object_id, + const NodeID &client_id) { + SendPullRequest(object_id, client_id); + }; + const auto &get_time = []() { return absl::GetCurrentTimeNanos() / 1e9; }; + pull_manager_.reset(new PullManager(self_node_id_, object_is_local, send_pull_request, + restore_spilled_object_, get_time, + config.pull_timeout_ms)); + push_manager_.reset(new PushManager(/* max_chunks_in_flight= */ std::max( static_cast(1L), static_cast(config_.max_bytes_in_flight / config_.object_chunk_size)))); @@ -87,40 +99,14 @@ 
ObjectManager::ObjectManager(asio::io_service &main_service, const NodeID &self_ main_service, config_.store_socket_name); } - const auto &object_is_local = [this](const ObjectID &object_id) { - return local_objects_.count(object_id) != 0; - }; - const auto &send_pull_request = [this](const ObjectID &object_id, - const NodeID &client_id) { - SendPullRequest(object_id, client_id); - }; - const auto &get_time = []() { return absl::GetCurrentTimeNanos() / 1e9; }; - int64_t available_memory = config.object_store_memory; - if (available_memory < 0) { - available_memory = 0; - } - pull_manager_.reset(new PullManager( - self_node_id_, object_is_local, send_pull_request, restore_spilled_object_, - get_time, config.pull_timeout_ms, available_memory, - [spill_objects_callback, object_store_full_callback]() { - // TODO(swang): This copies the out-of-memory handling in the - // CreateRequestQueue. It would be nice to unify these. - if (object_store_full_callback) { - object_store_full_callback(); - } - - static_cast(spill_objects_callback()); - })); - store_notification_->SubscribeObjAdded( [this](const object_manager::protocol::ObjectInfoT &object_info) { HandleObjectAdded(object_info); }); store_notification_->SubscribeObjDeleted([this](const ObjectID &oid) { + // TODO(swang): We may want to force the pull manager to fetch this object + // again, in case it was needed by an active pull request. NotifyDirectoryObjectDeleted(oid); - // Ask the pull manager to fetch this object again as soon as possible, if - // it was needed by an active pull request. 
- pull_manager_->ResetRetryTimer(oid); }); // Start object manager rpc server and send & receive request threads @@ -220,10 +206,8 @@ uint64_t ObjectManager::Pull(const std::vector &object_ref const auto &callback = [this](const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size) { - pull_manager_->OnLocationChange(object_id, client_ids, spilled_url, spilled_node_id, - object_size); + const std::string &spilled_url) { + pull_manager_->OnLocationChange(object_id, client_ids, spilled_url); }; for (const auto &ref : objects_to_locate) { @@ -515,8 +499,7 @@ ray::Status ObjectManager::LookupRemainingWaitObjects(const UniqueID &wait_id) { object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &lookup_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, const NodeID &spilled_node_id, - size_t object_size) { + const std::string &spilled_url) { auto &wait_state = active_wait_requests_.find(wait_id)->second; // Note that the object is guaranteed to be added to local_objects_ before // the notification is triggered. 
@@ -557,8 +540,7 @@ void ObjectManager::SubscribeRemainingWaitObjects(const UniqueID &wait_id) { wait_id, object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &subscribe_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, const NodeID &spilled_node_id, - size_t object_size) { + const std::string &spilled_url) { auto object_id_wait_state = active_wait_requests_.find(wait_id); if (object_id_wait_state == active_wait_requests_.end()) { // Depending on the timing of calls to the object directory, we @@ -818,7 +800,6 @@ std::string ObjectManager::DebugString() const { result << "\n" << object_directory_->DebugString(); result << "\n" << store_notification_->DebugString(); result << "\n" << buffer_pool_.DebugString(); - result << "\n" << pull_manager_->DebugString(); return result.str(); } @@ -834,9 +815,6 @@ void ObjectManager::FillObjectStoreStats(rpc::GetNodeStatsReply *reply) const { stats->set_object_store_bytes_used(used_memory_); stats->set_object_store_bytes_avail(config_.object_store_memory); stats->set_num_local_objects(local_objects_.size()); - if (plasma::plasma_store_runner) { - stats->set_consumed_bytes(plasma::plasma_store_runner->GetConsumedBytes()); - } } void ObjectManager::Tick(const boost::system::error_code &e) { @@ -844,16 +822,6 @@ void ObjectManager::Tick(const boost::system::error_code &e) { << ". Please file a bug report on here: " "https://github.com/ray-project/ray/issues"; - // Request the current available memory from the object - // store. 
- if (plasma::plasma_store_runner) { - plasma::plasma_store_runner->GetAvailableMemoryAsync([this](size_t available_memory) { - main_service_->post([this, available_memory]() { - pull_manager_->UpdatePullsBasedOnAvailableMemory(available_memory); - }); - }); - } - pull_manager_->Tick(); auto interval = boost::posix_time::milliseconds(config_.timer_freq_ms); diff --git a/src/ray/object_manager/object_manager.h b/src/ray/object_manager/object_manager.h index 00073012213a..a114f16bc446 100644 --- a/src/ray/object_manager/object_manager.h +++ b/src/ray/object_manager/object_manager.h @@ -106,9 +106,8 @@ class ObjectManagerInterface { class ObjectManager : public ObjectManagerInterface, public rpc::ObjectManagerServiceHandler { public: - using RestoreSpilledObjectCallback = - std::function)>; + using RestoreSpilledObjectCallback = std::function)>; /// Implementation of object manager service diff --git a/src/ray/object_manager/ownership_based_object_directory.cc b/src/ray/object_manager/ownership_based_object_directory.cc index e5477c0c20f7..df11a4bb750f 100644 --- a/src/ray/object_manager/ownership_based_object_directory.cc +++ b/src/ray/object_manager/ownership_based_object_directory.cc @@ -34,56 +34,6 @@ void FilterRemovedNodes(std::shared_ptr gcs_client, } } -/// Update object location data based on response from the owning core worker. 
-bool UpdateObjectLocations(const rpc::GetObjectLocationsOwnerReply &location_reply, - const Status &status, const ObjectID &object_id, - std::shared_ptr gcs_client, - std::unordered_set *node_ids, std::string *spilled_url, - NodeID *spilled_node_id, size_t *object_size) { - bool is_updated = false; - - std::unordered_set new_node_ids; - - if (!status.ok()) { - RAY_LOG(INFO) << "Failed to return location updates to subscribers for " << object_id - << ": " << status.ToString() - << ", assuming that the object was freed or evicted."; - // When we can't get location updates from the owner, we assume that the object was - // freed or evicted, so we send an empty location update to all subscribers. - *node_ids = new_node_ids; - is_updated = true; - } else { - // The size can be 0 if the update was a deletion. This assumes that an - // object's size is always greater than 0. - // TODO(swang): If that's not the case, we should use a flag to check - // whether the size is set instead. - if (location_reply.object_size() > 0) { - *object_size = location_reply.object_size(); - is_updated = true; - } - for (auto const &node_id : location_reply.node_ids()) { - new_node_ids.emplace(NodeID::FromBinary(node_id)); - } - // Filter out the removed nodes from the object locations. 
- FilterRemovedNodes(gcs_client, &new_node_ids); - if (new_node_ids != *node_ids) { - *node_ids = new_node_ids; - is_updated = true; - } - const std::string &new_spilled_url = location_reply.spilled_url(); - if (new_spilled_url != *spilled_url) { - const auto new_spilled_node_id = - NodeID::FromBinary(location_reply.spilled_node_id()); - RAY_LOG(DEBUG) << "Received object spilled to " << new_spilled_url << " spilled on " - << new_spilled_node_id; - *spilled_url = new_spilled_url; - *spilled_node_id = new_spilled_node_id; - is_updated = true; - } - } - return is_updated; -} - rpc::Address GetOwnerAddressFromObjectInfo( const object_manager::protocol::ObjectInfoT &object_info) { rpc::Address owner_address; @@ -130,18 +80,11 @@ ray::Status OwnershipBasedObjectDirectory::ReportObjectAdded( request.set_node_id(node_id.Binary()); rpc_client->AddObjectLocationOwner( - request, [worker_id, object_id, node_id]( - Status status, const rpc::AddObjectLocationOwnerReply &reply) { + request, [worker_id, object_id](Status status, + const rpc::AddObjectLocationOwnerReply &reply) { if (!status.ok()) { - if (status.IsObjectNotFound()) { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to add the location " - << node_id << " for " << object_id - << " because the owner no longer has the object; we assume the " - "object was evicted."; - } else { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to add the location " - << node_id << " for " << object_id << ": " << status.ToString(); - } + RAY_LOG(ERROR) << "Worker " << worker_id << " failed to add the location for " + << object_id; } }); return Status::OK(); @@ -165,18 +108,11 @@ ray::Status OwnershipBasedObjectDirectory::ReportObjectRemoved( request.set_node_id(node_id.Binary()); rpc_client->RemoveObjectLocationOwner( - request, [worker_id, object_id, node_id]( - Status status, const rpc::RemoveObjectLocationOwnerReply &reply) { + request, [worker_id, object_id](Status status, + const rpc::RemoveObjectLocationOwnerReply 
&reply) { if (!status.ok()) { - if (status.IsObjectNotFound()) { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to remove the location " - << node_id << " for " << object_id - << " because the owner no longer has the object; we assume the " - "object was freed."; - } else { - RAY_LOG(INFO) << "Worker " << worker_id << " failed to remove the location " - << node_id << " for " << object_id << ": " << status.ToString(); - } + RAY_LOG(ERROR) << "Worker " << worker_id + << " failed to remove the location for " << object_id; } }); return Status::OK(); @@ -185,33 +121,27 @@ ray::Status OwnershipBasedObjectDirectory::ReportObjectRemoved( void OwnershipBasedObjectDirectory::SubscriptionCallback( ObjectID object_id, WorkerID worker_id, Status status, const rpc::GetObjectLocationsOwnerReply &reply) { - // Objects are added to this map in SubscribeObjectLocations. auto it = listeners_.find(object_id); - // Do nothing for objects we are not listening for. if (it == listeners_.end()) { return; } - // Once this flag is set to true, it should never go back to false. - it->second.subscribed = true; - // Update entries for this object. - if (UpdateObjectLocations(reply, status, object_id, gcs_client_, - &it->second.current_object_locations, &it->second.spilled_url, - &it->second.spilled_node_id, &it->second.object_size)) { - // Copy the callbacks so that the callbacks can unsubscribe without interrupting - // looping over the callbacks. + std::unordered_set node_ids; + for (auto const &node_id : reply.node_ids()) { + node_ids.emplace(NodeID::FromBinary(node_id)); + } + FilterRemovedNodes(gcs_client_, &node_ids); + if (node_ids != it->second.current_object_locations) { + it->second.current_object_locations = std::move(node_ids); auto callbacks = it->second.callbacks; // Call all callbacks associated with the object id locations we have // received. 
This notifies the client even if the list of locations is // empty, since this may indicate that the objects have been evicted from // all nodes. for (const auto &callback_pair : callbacks) { - // We can call the callback directly without worrying about invalidating caller - // iterators since this is already running in the subscription callback stack. - // See https://github.com/ray-project/ray/issues/2959. - callback_pair.second(object_id, it->second.current_object_locations, - it->second.spilled_url, it->second.spilled_node_id, - it->second.object_size); + // It is safe to call the callback directly since this is already running + // in the subscription callback stack. + callback_pair.second(object_id, it->second.current_object_locations, ""); } } @@ -219,7 +149,7 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( rpc::GetObjectLocationsOwnerRequest request; request.set_intended_worker_id(worker_id.Binary()); request.set_object_id(object_id.Binary()); - request.set_last_version(reply.current_version()); + // TODO(zhuohan): Fix this infinite loop. worker_it->second->GetObjectLocationsOwner( request, std::bind(&OwnershipBasedObjectDirectory::SubscriptionCallback, this, object_id, @@ -241,7 +171,6 @@ ray::Status OwnershipBasedObjectDirectory::SubscribeObjectLocations( rpc::GetObjectLocationsOwnerRequest request; request.set_intended_worker_id(owner_address.worker_id()); request.set_object_id(object_id.Binary()); - request.set_last_version(-1); rpc_client->GetObjectLocationsOwner( request, std::bind(&OwnershipBasedObjectDirectory::SubscriptionCallback, this, object_id, @@ -254,22 +183,6 @@ ray::Status OwnershipBasedObjectDirectory::SubscribeObjectLocations( return Status::OK(); } listener_state.callbacks.emplace(callback_id, callback); - - // If we previously received some notifications about the object's locations, - // immediately notify the caller of the current known locations. 
- if (listener_state.subscribed) { - auto &locations = listener_state.current_object_locations; - auto &spilled_url = listener_state.spilled_url; - auto &spilled_node_id = listener_state.spilled_node_id; - auto object_size = listener_state.object_size; - // We post the callback to the event loop in order to avoid mutating data structures - // shared with the caller and potentially invalidating caller iterators. - // See https://github.com/ray-project/ray/issues/2959. - io_service_.post( - [callback, locations, spilled_url, spilled_node_id, object_size, object_id]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); - } return Status::OK(); } @@ -289,63 +202,35 @@ ray::Status OwnershipBasedObjectDirectory::UnsubscribeObjectLocations( ray::Status OwnershipBasedObjectDirectory::LookupLocations( const ObjectID &object_id, const rpc::Address &owner_address, const OnLocationsFound &callback) { - auto it = listeners_.find(object_id); - if (it != listeners_.end() && it->second.subscribed) { - // If we have locations cached due to a concurrent SubscribeObjectLocations - // call, and we have received at least one update from the owner about - // the object's creation, then call the callback immediately with the - // cached locations. - auto &locations = it->second.current_object_locations; - auto &spilled_url = it->second.spilled_url; - auto &spilled_node_id = it->second.spilled_node_id; - auto object_size = it->second.object_size; - // We post the callback to the event loop in order to avoid mutating data structures - // shared with the caller and potentially invalidating caller iterators. - // See https://github.com/ray-project/ray/issues/2959. 
- io_service_.post( - [callback, object_id, locations, spilled_url, spilled_node_id, object_size]() { - callback(object_id, locations, spilled_url, spilled_node_id, object_size); - }); - } else { - WorkerID worker_id = WorkerID::FromBinary(owner_address.worker_id()); - std::shared_ptr rpc_client = GetClient(owner_address); - if (rpc_client == nullptr) { - RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. " - << "LookupLocations returns an empty list of locations."; - // We post the callback to the event loop in order to avoid mutating data structures - // shared with the caller and potentially invalidating caller iterators. - // See https://github.com/ray-project/ray/issues/2959. - io_service_.post([callback, object_id]() { - callback(object_id, std::unordered_set(), "", NodeID::Nil(), 0); - }); - return Status::OK(); - } + WorkerID worker_id = WorkerID::FromBinary(owner_address.worker_id()); + std::shared_ptr rpc_client = GetClient(owner_address); + if (rpc_client == nullptr) { + RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. 
" + << "LookupLocations returns an empty list of locations."; + io_service_.post([callback, object_id]() { + callback(object_id, std::unordered_set(), ""); + }); + return Status::OK(); + } - rpc::GetObjectLocationsOwnerRequest request; - request.set_intended_worker_id(owner_address.worker_id()); - request.set_object_id(object_id.Binary()); - request.set_last_version(-1); + rpc::GetObjectLocationsOwnerRequest request; + request.set_intended_worker_id(owner_address.worker_id()); + request.set_object_id(object_id.Binary()); - rpc_client->GetObjectLocationsOwner( - request, [this, worker_id, object_id, callback]( - Status status, const rpc::GetObjectLocationsOwnerReply &reply) { - if (!status.ok()) { - RAY_LOG(ERROR) << "Worker " << worker_id << " failed to get the location for " - << object_id; - } - std::unordered_set node_ids; - std::string spilled_url; - NodeID spilled_node_id; - size_t object_size = 0; - UpdateObjectLocations(reply, status, object_id, gcs_client_, &node_ids, - &spilled_url, &spilled_node_id, &object_size); - // We can call the callback directly without worrying about invalidating - // caller iterators since this is already running in the core worker - // client's lookup callback stack. - // See https://github.com/ray-project/ray/issues/2959. 
- callback(object_id, node_ids, spilled_url, spilled_node_id, object_size); - }); - } + rpc_client->GetObjectLocationsOwner( + request, [this, worker_id, object_id, callback]( + Status status, const rpc::GetObjectLocationsOwnerReply &reply) { + if (!status.ok()) { + RAY_LOG(ERROR) << "Worker " << worker_id << " failed to get the location for " + << object_id; + } + std::unordered_set node_ids; + for (auto const &node_id : reply.node_ids()) { + node_ids.emplace(NodeID::FromBinary(node_id)); + } + FilterRemovedNodes(gcs_client_, &node_ids); + callback(object_id, node_ids, ""); + }); return Status::OK(); } diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 9b9bb5408df4..a5429d985f91 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -121,10 +121,10 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data, int device_num); Status Get(const std::vector &object_ids, int64_t timeout_ms, - std::vector *object_buffers, bool is_from_worker); + std::vector *object_buffers); Status Get(const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - ObjectBuffer *object_buffers, bool is_from_worker); + ObjectBuffer *object_buffers); Status Release(const ObjectID &object_id); @@ -172,7 +172,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this( const ObjectID &, const std::shared_ptr &)> &wrap_buffer, - ObjectBuffer *object_buffers, bool is_from_worker); + ObjectBuffer *object_buffers); uint8_t *LookupMmappedFile(MEMFD_TYPE store_fd_val); @@ -362,7 +362,7 @@ Status PlasmaClient::Impl::GetBuffers( const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, const std::function( const ObjectID &, const std::shared_ptr &)> &wrap_buffer, - ObjectBuffer *object_buffers, bool is_from_worker) { + ObjectBuffer *object_buffers) { // Fill out the info for the objects that are already in use locally. 
bool all_present = true; for (int64_t i = 0; i < num_objects; ++i) { @@ -409,8 +409,7 @@ Status PlasmaClient::Impl::GetBuffers( // If we get here, then the objects aren't all currently in use by this // client, so we need to send a request to the plasma store. - RAY_RETURN_NOT_OK(SendGetRequest(store_conn_, &object_ids[0], num_objects, timeout_ms, - is_from_worker)); + RAY_RETURN_NOT_OK(SendGetRequest(store_conn_, &object_ids[0], num_objects, timeout_ms)); std::vector buffer; RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); @@ -471,8 +470,7 @@ Status PlasmaClient::Impl::GetBuffers( } Status PlasmaClient::Impl::Get(const std::vector &object_ids, - int64_t timeout_ms, std::vector *out, - bool is_from_worker) { + int64_t timeout_ms, std::vector *out) { std::lock_guard guard(client_mutex_); const auto wrap_buffer = [=](const ObjectID &object_id, @@ -481,19 +479,16 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, }; const size_t num_objects = object_ids.size(); *out = std::vector(num_objects); - return GetBuffers(&object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], - is_from_worker); + return GetBuffers(&object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0]); } Status PlasmaClient::Impl::Get(const ObjectID *object_ids, int64_t num_objects, - int64_t timeout_ms, ObjectBuffer *out, - bool is_from_worker) { + int64_t timeout_ms, ObjectBuffer *out) { std::lock_guard guard(client_mutex_); const auto wrap_buffer = [](const ObjectID &object_id, const std::shared_ptr &buffer) { return buffer; }; - return GetBuffers(object_ids, num_objects, timeout_ms, wrap_buffer, out, - is_from_worker); + return GetBuffers(object_ids, num_objects, timeout_ms, wrap_buffer, out); } Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID &object_id) { @@ -758,14 +753,13 @@ Status PlasmaClient::TryCreateImmediately(const ObjectID &object_id, } Status PlasmaClient::Get(const 
std::vector &object_ids, int64_t timeout_ms, - std::vector *object_buffers, bool is_from_worker) { - return impl_->Get(object_ids, timeout_ms, object_buffers, is_from_worker); + std::vector *object_buffers) { + return impl_->Get(object_ids, timeout_ms, object_buffers); } Status PlasmaClient::Get(const ObjectID *object_ids, int64_t num_objects, - int64_t timeout_ms, ObjectBuffer *object_buffers, - bool is_from_worker) { - return impl_->Get(object_ids, num_objects, timeout_ms, object_buffers, is_from_worker); + int64_t timeout_ms, ObjectBuffer *object_buffers) { + return impl_->Get(object_ids, num_objects, timeout_ms, object_buffers); } Status PlasmaClient::Release(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index 703250bd23b0..e88a9eb138a1 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -161,10 +161,9 @@ class PlasmaClient { /// \param timeout_ms The amount of time in milliseconds to wait before this /// request times out. If this value is -1, then no timeout is set. /// \param[out] object_buffers The object results. - /// \param is_from_worker Whether or not if the Get request comes from a Ray workers. /// \return The return status. Status Get(const std::vector &object_ids, int64_t timeout_ms, - std::vector *object_buffers, bool is_from_worker); + std::vector *object_buffers); /// Deprecated variant of Get() that doesn't automatically release buffers /// when they get out of scope. @@ -174,13 +173,12 @@ class PlasmaClient { /// \param timeout_ms The amount of time in milliseconds to wait before this /// request times out. If this value is -1, then no timeout is set. /// \param object_buffers An array where the results will be stored. - /// \param is_from_worker Whether or not if the Get request comes from a Ray workers. /// \return The return status. 
/// /// The caller is responsible for releasing any retrieved objects, but it /// should not release objects that were not retrieved. Status Get(const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - ObjectBuffer *object_buffers, bool is_from_worker); + ObjectBuffer *object_buffers); /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. diff --git a/src/ray/object_manager/plasma/create_request_queue.cc b/src/ray/object_manager/plasma/create_request_queue.cc index e8f45581b643..ddb9b089157d 100644 --- a/src/ray/object_manager/plasma/create_request_queue.cc +++ b/src/ray/object_manager/plasma/create_request_queue.cc @@ -81,7 +81,16 @@ std::pair CreateRequestQueue::TryRequestImmediately( } bool CreateRequestQueue::ProcessRequest(std::unique_ptr &request) { - request->error = request->create_callback(&request->result); + // TODO(sang): Delete this logic when lru evict is removed. + bool evict_if_full = evict_if_full_; + if (oom_start_time_ns_ != -1) { + // If the first attempt fails, we set the evict_if_full true. + // We need this logic because if lru_evict flag is on, this is false because we + // shouldn't evict objects in the first attempt. 
+ evict_if_full = true; + } + request->error = + request->create_callback(/*evict_if_full=*/evict_if_full, &request->result); return request->error != PlasmaError::OutOfMemory; } diff --git a/src/ray/object_manager/plasma/create_request_queue.h b/src/ray/object_manager/plasma/create_request_queue.h index d22ac292b0a8..d2ac288bdeeb 100644 --- a/src/ray/object_manager/plasma/create_request_queue.h +++ b/src/ray/object_manager/plasma/create_request_queue.h @@ -31,16 +31,22 @@ namespace plasma { class CreateRequestQueue { public: - using CreateObjectCallback = std::function; + using CreateObjectCallback = + std::function; - CreateRequestQueue(int64_t oom_grace_period_s, + CreateRequestQueue(bool evict_if_full, int64_t oom_grace_period_s, ray::SpillObjectsCallback spill_objects_callback, std::function trigger_global_gc, std::function get_time) - : oom_grace_period_ns_(oom_grace_period_s * 1e9), + : evict_if_full_(evict_if_full), + oom_grace_period_ns_(oom_grace_period_s * 1e9), spill_objects_callback_(spill_objects_callback), trigger_global_gc_(trigger_global_gc), - get_time_(get_time) {} + get_time_(get_time) { + RAY_LOG(DEBUG) << "Starting plasma::CreateRequestQueue with OOM grace period " + << oom_grace_period_ns_ << ", evict if full? " + << (evict_if_full_ ? 1 : 0); + } /// Add a request to the queue. The caller should use the returned request ID /// to later get the result of the request. @@ -145,6 +151,11 @@ class CreateRequestQueue { /// a request by retrying. Start at 1 because 0 means "do not retry". uint64_t next_req_id_ = 1; + /// On the first attempt to create an object, whether to evict from the + /// object store to make space. If the first attempt fails, then we will + /// always try to evict. + const bool evict_if_full_; + /// Grace period until we throw the OOM error to the application. /// -1 means grace period is infinite. 
const int64_t oom_grace_period_ns_; diff --git a/src/ray/object_manager/plasma/eviction_policy.h b/src/ray/object_manager/plasma/eviction_policy.h index d20d0b51eeb7..91788bb34ca5 100644 --- a/src/ray/object_manager/plasma/eviction_policy.h +++ b/src/ray/object_manager/plasma/eviction_policy.h @@ -196,8 +196,6 @@ class EvictionPolicy { /// Returns debugging information for this eviction policy. virtual std::string DebugString() const; - int64_t GetPinnedMemoryBytes() const { return pinned_memory_bytes_; } - protected: /// Returns the size of the object int64_t GetObjectSize(const ObjectID &object_id) const; diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index 5a268a891d4a..3816de79e842 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -210,8 +210,6 @@ table PlasmaGetRequest { object_ids: [string]; // The number of milliseconds before the request should timeout. timeout_ms: long; - // Whether or not the get request is from the core worker. It is used to record how many bytes are consumed by core workers. - is_from_worker: bool; } table PlasmaGetReply { diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index c3b5b55ee1d5..8c3164d6a7df 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -553,16 +553,16 @@ Status ReadEvictReply(uint8_t *data, size_t size, int64_t &num_bytes) { // Get messages. 
Status SendGetRequest(const std::shared_ptr &store_conn, - const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - bool is_from_worker) { + const ObjectID *object_ids, int64_t num_objects, + int64_t timeout_ms) { flatbuffers::FlatBufferBuilder fbb; auto message = fb::CreatePlasmaGetRequest( - fbb, ToFlatbuffer(&fbb, object_ids, num_objects), timeout_ms, is_from_worker); + fbb, ToFlatbuffer(&fbb, object_ids, num_objects), timeout_ms); return PlasmaSend(store_conn, MessageType::PlasmaGetRequest, &fbb, message); } Status ReadGetRequest(uint8_t *data, size_t size, std::vector &object_ids, - int64_t *timeout_ms, bool *is_from_worker) { + int64_t *timeout_ms) { RAY_DCHECK(data); auto message = flatbuffers::GetRoot(data); RAY_DCHECK(VerifyFlatbuffer(message, data, size)); @@ -571,7 +571,6 @@ Status ReadGetRequest(uint8_t *data, size_t size, std::vector &object_ object_ids.push_back(ObjectID::FromBinary(object_id)); } *timeout_ms = message->timeout_ms(); - *is_from_worker = message->is_from_worker(); return Status::OK(); } diff --git a/src/ray/object_manager/plasma/protocol.h b/src/ray/object_manager/plasma/protocol.h index f5baf03ec955..a8ba71b4621f 100644 --- a/src/ray/object_manager/plasma/protocol.h +++ b/src/ray/object_manager/plasma/protocol.h @@ -128,11 +128,11 @@ Status ReadSealReply(uint8_t *data, size_t size, ObjectID *object_id); /* Plasma Get message functions. 
*/ Status SendGetRequest(const std::shared_ptr &store_conn, - const ObjectID *object_ids, int64_t num_objects, int64_t timeout_ms, - bool is_from_worker); + const ObjectID *object_ids, int64_t num_objects, + int64_t timeout_ms); Status ReadGetRequest(uint8_t *data, size_t size, std::vector &object_ids, - int64_t *timeout_ms, bool *is_from_worker); + int64_t *timeout_ms); Status SendGetReply(const std::shared_ptr &client, ObjectID object_ids[], std::unordered_map &plasma_objects, diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index 642d842047c7..9bae68b3a3a8 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -69,7 +69,7 @@ namespace plasma { struct GetRequest { GetRequest(boost::asio::io_service &io_context, const std::shared_ptr &client, - const std::vector &object_ids, bool is_from_worker); + const std::vector &object_ids); /// The client that called get. std::shared_ptr client; /// The object IDs involved in this request. This is used in the reply. @@ -82,9 +82,6 @@ struct GetRequest { /// The number of object requests in this wait request that are already /// satisfied. int64_t num_satisfied; - /// Whether or not the request comes from the core worker. It is used to track the size - /// of total objects that are consumed by core worker. 
- bool is_from_worker; void AsyncWait(int64_t timeout_ms, std::function on_timeout) { @@ -103,12 +100,11 @@ struct GetRequest { GetRequest::GetRequest(boost::asio::io_service &io_context, const std::shared_ptr &client, - const std::vector &object_ids, bool is_from_worker) + const std::vector &object_ids) : client(client), object_ids(object_ids.begin(), object_ids.end()), objects(object_ids.size()), num_satisfied(0), - is_from_worker(is_from_worker), timer_(io_context) { std::unordered_set unique_ids(object_ids.begin(), object_ids.end()); num_objects_to_wait_for = unique_ids.size(); @@ -129,6 +125,7 @@ PlasmaStore::PlasmaStore(boost::asio::io_service &main_service, std::string dire usage_log_interval_ns_(RayConfig::instance().object_store_usage_log_interval_s() * 1e9), create_request_queue_( + /*evict_if_full=*/RayConfig::instance().object_pinning_enabled(), /*oom_grace_period_s=*/RayConfig::instance().oom_grace_period_s(), spill_objects_callback, object_store_full_callback, /*get_time=*/ @@ -162,7 +159,6 @@ void PlasmaStore::AddToClientObjectIds(const ObjectID &object_id, ObjectTableEnt if (entry->ref_count == 0) { // Tell the eviction policy that this object is being used. eviction_policy_.BeginObjectAccess(object_id); - num_bytes_in_use_ += entry->data_size + entry->metadata_size; } // Increase reference count. entry->ref_count++; @@ -172,19 +168,21 @@ void PlasmaStore::AddToClientObjectIds(const ObjectID &object_id, ObjectTableEnt } // Allocate memory -uint8_t *PlasmaStore::AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_size, - ptrdiff_t *offset, +uint8_t *PlasmaStore::AllocateMemory(size_t size, bool evict_if_full, MEMFD_TYPE *fd, + int64_t *map_size, ptrdiff_t *offset, const std::shared_ptr &client, bool is_create, PlasmaError *error) { // First free up space from the client's LRU queue if quota enforcement is on. 
- std::vector client_objects_to_evict; - bool quota_ok = eviction_policy_.EnforcePerClientQuota(client.get(), size, is_create, - &client_objects_to_evict); - if (!quota_ok) { - *error = PlasmaError::OutOfMemory; - return nullptr; + if (evict_if_full) { + std::vector client_objects_to_evict; + bool quota_ok = eviction_policy_.EnforcePerClientQuota(client.get(), size, is_create, + &client_objects_to_evict); + if (!quota_ok) { + *error = PlasmaError::OutOfMemory; + return nullptr; + } + EvictObjects(client_objects_to_evict); } - EvictObjects(client_objects_to_evict); // Try to evict objects until there is enough space. uint8_t *pointer = nullptr; @@ -197,7 +195,7 @@ uint8_t *PlasmaStore::AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_s // it is not guaranteed that the corresponding pointer in the client will be // 64-byte aligned, but in practice it often will be. pointer = reinterpret_cast(PlasmaAllocator::Memalign(kBlockSize, size)); - if (pointer) { + if (pointer || !evict_if_full) { // If we manage to allocate the memory, return the pointer. If we cannot // allocate the space, but we are also not allowed to evict anything to // make more space, return an error to the client. 
@@ -233,6 +231,7 @@ uint8_t *PlasmaStore::AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_s PlasmaError PlasmaStore::HandleCreateObjectRequest(const std::shared_ptr &client, const std::vector &message, + bool evict_if_full, PlasmaObject *object) { uint8_t *input = (uint8_t *)message.data(); size_t input_size = message.size(); @@ -248,9 +247,9 @@ PlasmaError PlasmaStore::HandleCreateObjectRequest(const std::shared_ptr ReadCreateRequest(input, input_size, &object_id, &owner_raylet_id, &owner_ip_address, &owner_port, &owner_worker_id, &data_size, &metadata_size, &device_num); - auto error = - CreateObject(object_id, owner_raylet_id, owner_ip_address, owner_port, - owner_worker_id, data_size, metadata_size, device_num, client, object); + auto error = CreateObject(object_id, owner_raylet_id, owner_ip_address, owner_port, + owner_worker_id, evict_if_full, data_size, metadata_size, + device_num, client, object); if (error == PlasmaError::OutOfMemory) { RAY_LOG(DEBUG) << "Not enough memory to create the object " << object_id << ", data_size=" << data_size << ", metadata_size=" << metadata_size; @@ -258,13 +257,11 @@ PlasmaError PlasmaStore::HandleCreateObjectRequest(const std::shared_ptr return error; } -PlasmaError PlasmaStore::CreateObject(const ObjectID &object_id, - const NodeID &owner_raylet_id, - const std::string &owner_ip_address, int owner_port, - const WorkerID &owner_worker_id, int64_t data_size, - int64_t metadata_size, int device_num, - const std::shared_ptr &client, - PlasmaObject *result) { +PlasmaError PlasmaStore::CreateObject( + const ObjectID &object_id, const NodeID &owner_raylet_id, + const std::string &owner_ip_address, int owner_port, const WorkerID &owner_worker_id, + bool evict_if_full, int64_t data_size, int64_t metadata_size, int device_num, + const std::shared_ptr &client, PlasmaObject *result) { RAY_LOG(DEBUG) << "creating object " << object_id.Hex() << " size " << data_size; auto entry = GetObjectTableEntry(&store_info_, object_id); 
@@ -282,7 +279,8 @@ PlasmaError PlasmaStore::CreateObject(const ObjectID &object_id, if (device_num == 0) { PlasmaError error = PlasmaError::OK; - pointer = AllocateMemory(total_size, &fd, &map_size, &offset, client, true, &error); + pointer = AllocateMemory(total_size, evict_if_full, &fd, &map_size, &offset, client, + true, &error); if (!pointer) { return error; } @@ -394,9 +392,6 @@ void PlasmaStore::ReturnFromGet(GetRequest *get_req) { fds_to_send.insert(fd); store_fds.push_back(fd); mmap_sizes.push_back(GetMmapSize(fd)); - if (get_req->is_from_worker) { - total_consumed_bytes_ += object.data_size + object.metadata_size; - } } } // Send the get reply to the client. @@ -469,9 +464,9 @@ void PlasmaStore::UpdateObjectGetRequests(const ObjectID &object_id) { void PlasmaStore::ProcessGetRequest(const std::shared_ptr &client, const std::vector &object_ids, - int64_t timeout_ms, bool is_from_worker) { + int64_t timeout_ms) { // Create a get request for this object. - auto get_req = new GetRequest(io_context_, client, object_ids, is_from_worker); + auto get_req = new GetRequest(io_context_, client, object_ids); for (auto object_id : object_ids) { // Check if this object is already present // locally. If so, record that the object is being used and mark it as accounted for. @@ -488,9 +483,9 @@ void PlasmaStore::ProcessGetRequest(const std::shared_ptr &client, RAY_CHECK(!entry->pointer); PlasmaError error = PlasmaError::OK; - entry->pointer = - AllocateMemory(entry->data_size + entry->metadata_size, &entry->fd, - &entry->map_size, &entry->offset, client, false, &error); + entry->pointer = AllocateMemory(entry->data_size + entry->metadata_size, + /*evict=*/true, &entry->fd, &entry->map_size, + &entry->offset, client, false, &error); if (entry->pointer) { // TODO(suquark): Not sure if this old behavior is still compatible // with our current object spilling mechanics. 
@@ -542,7 +537,6 @@ int PlasmaStore::RemoveFromClientObjectIds(const ObjectID &object_id, // If no more clients are using this object, notify the eviction policy // that the object is no longer being used. if (entry->ref_count == 0) { - num_bytes_in_use_ -= entry->data_size + entry->metadata_size; RAY_LOG(DEBUG) << "Releasing object no longer in use " << object_id; if (deletion_cache_.count(object_id) == 0) { // Tell the eviction policy that this object is no longer being used. @@ -568,10 +562,6 @@ void PlasmaStore::EraseFromObjectTable(const ObjectID &object_id) { if (object->device_num == 0) { PlasmaAllocator::Free(object->pointer, buff_size); } - if (object->ref_count > 0) { - // A client was using this object. - num_bytes_in_use_ -= object->data_size + object->metadata_size; - } store_info_.objects.erase(object_id); } @@ -862,8 +852,9 @@ Status PlasmaStore::ProcessMessage(const std::shared_ptr &client, const auto &object_id = GetCreateRequestObjectId(message); const auto &request = flatbuffers::GetRoot(input); - auto handle_create = [this, client, message](PlasmaObject *result) { - return HandleCreateObjectRequest(client, message, result); + auto handle_create = [this, client, message](bool evict_if_full, + PlasmaObject *result) { + return HandleCreateObjectRequest(client, message, evict_if_full, result); }; if (request->try_immediately()) { @@ -901,10 +892,8 @@ Status PlasmaStore::ProcessMessage(const std::shared_ptr &client, case fb::MessageType::PlasmaGetRequest: { std::vector object_ids_to_get; int64_t timeout_ms; - bool is_from_worker; - RAY_RETURN_NOT_OK(ReadGetRequest(input, input_size, object_ids_to_get, &timeout_ms, - &is_from_worker)); - ProcessGetRequest(client, object_ids_to_get, timeout_ms, is_from_worker); + RAY_RETURN_NOT_OK(ReadGetRequest(input, input_size, object_ids_to_get, &timeout_ms)); + ProcessGetRequest(client, object_ids_to_get, timeout_ms); } break; case fb::MessageType::PlasmaReleaseRequest: { 
RAY_RETURN_NOT_OK(ReadReleaseRequest(input, input_size, &object_id)); @@ -1029,11 +1018,6 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, } } -int64_t PlasmaStore::GetConsumedBytes() { - std::lock_guard guard(mutex_); - return total_consumed_bytes_; -} - bool PlasmaStore::IsObjectSpillable(const ObjectID &object_id) { // The lock is acquired when a request is received to the plasma store. // recursive mutex is used here to allow diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index c6561bf655b7..ec338d388514 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -33,7 +33,6 @@ #include "ray/object_manager/plasma/connection.h" #include "ray/object_manager/plasma/create_request_queue.h" #include "ray/object_manager/plasma/plasma.h" -#include "ray/object_manager/plasma/plasma_allocator.h" #include "ray/object_manager/plasma/protocol.h" #include "ray/object_manager/plasma/quota_aware_policy.h" @@ -77,6 +76,10 @@ class PlasmaStore { /// \param owner_ip_address IP address of the object's owner. /// \param owner_port Port of the object's owner. /// \param owner_worker_id Worker ID of the object's owner. + /// \param evict_if_full If this is true, then when the object store is full, + /// try to evict objects that are not currently referenced before + /// creating the object. Else, do not evict any objects and + /// immediately return an PlasmaError::OutOfMemory. /// \param data_size Size in bytes of the object to be created. /// \param metadata_size Size in bytes of the object metadata. /// \param device_num The number of the device where the object is being @@ -96,8 +99,8 @@ class PlasmaStore { /// plasma_release. 
PlasmaError CreateObject(const ObjectID &object_id, const NodeID &owner_raylet_id, const std::string &owner_ip_address, int owner_port, - const WorkerID &owner_worker_id, int64_t data_size, - int64_t metadata_size, int device_num, + const WorkerID &owner_worker_id, bool evict_if_full, + int64_t data_size, int64_t metadata_size, int device_num, const std::shared_ptr &client, PlasmaObject *result); /// Abort a created but unsealed object. If the client is not the @@ -135,8 +138,7 @@ class PlasmaStore { /// \param object_ids Object IDs of the objects to be gotten. /// \param timeout_ms The timeout for the get request in milliseconds. void ProcessGetRequest(const std::shared_ptr &client, - const std::vector &object_ids, int64_t timeout_ms, - bool is_from_worker); + const std::vector &object_ids, int64_t timeout_ms); /// Seal a vector of objects. The objects are now immutable and can be accessed with /// get. @@ -187,9 +189,6 @@ class PlasmaStore { /// before the object is pinned by raylet for the first time. bool IsObjectSpillable(const ObjectID &object_id); - /// Return the plasma object bytes that are consumed by core workers. - int64_t GetConsumedBytes(); - void SetNotificationListener( const std::shared_ptr ¬ification_listener) { notification_listener_ = notification_listener; @@ -210,17 +209,10 @@ class PlasmaStore { /// Process queued requests to create an object. 
void ProcessCreateRequests(); - void GetAvailableMemory(std::function callback) const { - int64_t num_bytes_in_use = static_cast(num_bytes_in_use_); - RAY_CHECK(PlasmaAllocator::GetFootprintLimit() >= num_bytes_in_use); - size_t available = PlasmaAllocator::GetFootprintLimit() - num_bytes_in_use; - callback(available); - } - private: PlasmaError HandleCreateObjectRequest(const std::shared_ptr &client, const std::vector &message, - PlasmaObject *object); + bool evict_if_full, PlasmaObject *object); void ReplyToCreateClient(const std::shared_ptr &client, const ObjectID &object_id, uint64_t req_id); @@ -251,9 +243,10 @@ class PlasmaStore { void EraseFromObjectTable(const ObjectID &object_id); - uint8_t *AllocateMemory(size_t size, MEMFD_TYPE *fd, int64_t *map_size, - ptrdiff_t *offset, const std::shared_ptr &client, - bool is_create, PlasmaError *error); + uint8_t *AllocateMemory(size_t size, bool evict_if_full, MEMFD_TYPE *fd, + int64_t *map_size, ptrdiff_t *offset, + const std::shared_ptr &client, bool is_create, + PlasmaError *error); // Start listening for clients. void DoAccept(); @@ -313,11 +306,6 @@ class PlasmaStore { /// interface that node manager or object manager can access the plasma store with this /// mutex if it is not absolutely necessary. std::recursive_mutex mutex_; - - size_t num_bytes_in_use_ = 0; - - /// Total plasma object bytes that are consumed by core workers. 
- int64_t total_consumed_bytes_ = 0; }; } // namespace plasma diff --git a/src/ray/object_manager/plasma/store_runner.cc b/src/ray/object_manager/plasma/store_runner.cc index 5a44e297cd42..34e08080cced 100644 --- a/src/ray/object_manager/plasma/store_runner.cc +++ b/src/ray/object_manager/plasma/store_runner.cc @@ -123,8 +123,6 @@ bool PlasmaStoreRunner::IsPlasmaObjectSpillable(const ObjectID &object_id) { return store_->IsObjectSpillable(object_id); } -int64_t PlasmaStoreRunner::GetConsumedBytes() { return store_->GetConsumedBytes(); } - std::unique_ptr plasma_store_runner; } // namespace plasma diff --git a/src/ray/object_manager/plasma/store_runner.h b/src/ray/object_manager/plasma/store_runner.h index f4785810cb24..3edd70350cc2 100644 --- a/src/ray/object_manager/plasma/store_runner.h +++ b/src/ray/object_manager/plasma/store_runner.h @@ -1,8 +1,9 @@ #pragma once -#include #include +#include + #include "absl/synchronization/mutex.h" #include "ray/object_manager/notification/object_store_notification_manager.h" #include "ray/object_manager/plasma/store.h" @@ -22,12 +23,6 @@ class PlasmaStoreRunner { } bool IsPlasmaObjectSpillable(const ObjectID &object_id); - int64_t GetConsumedBytes(); - - void GetAvailableMemoryAsync(std::function callback) const { - main_service_.post([this, callback]() { store_->GetAvailableMemory(callback); }); - } - private: void Shutdown(); absl::Mutex store_runner_mutex_; @@ -35,7 +30,7 @@ class PlasmaStoreRunner { int64_t system_memory_; bool hugepages_enabled_; std::string plasma_directory_; - mutable boost::asio::io_service main_service_; + boost::asio::io_service main_service_; std::unique_ptr store_; std::shared_ptr listener_; }; diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index 1ce460b81004..289ad13eb5cc 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -8,16 +8,13 @@ PullManager::PullManager( NodeID &self_node_id, const std::function 
object_is_local, const std::function send_pull_request, const RestoreSpilledObjectCallback restore_spilled_object, - const std::function get_time, int pull_timeout_ms, - size_t num_bytes_available, std::function object_store_full_callback) + const std::function get_time, int pull_timeout_ms) : self_node_id_(self_node_id), object_is_local_(object_is_local), send_pull_request_(send_pull_request), restore_spilled_object_(restore_spilled_object), get_time_(get_time), pull_timeout_ms_(pull_timeout_ms), - num_bytes_available_(num_bytes_available), - object_store_full_callback_(object_store_full_callback), gen_(std::chrono::high_resolution_clock::now().time_since_epoch().count()) {} uint64_t PullManager::Pull(const std::vector &object_ref_bundle, @@ -42,228 +39,33 @@ uint64_t PullManager::Pull(const std::vector &object_ref_b it->second.bundle_request_ids.insert(bundle_it->first); } - // We have a new request. Activate the new request, if the - // current available memory allows it. - UpdatePullsBasedOnAvailableMemory(num_bytes_available_); - return bundle_it->first; } -bool PullManager::ActivateNextPullBundleRequest( - const std::map>::iterator - &next_request_it, - std::vector *objects_to_pull) { - // Check that we have sizes for all of the objects in the bundle. If not, we - // should not activate the bundle, since it may put us over the available - // capacity. - for (const auto &ref : next_request_it->second) { - auto obj_id = ObjectRefToId(ref); - const auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - if (!it->second.object_size_set) { - // NOTE(swang): The size could be 0 if we haven't received size - // information yet. If we receive the size later on, we will update the - // total bytes being pulled then. - RAY_LOG(DEBUG) << "No size for " << obj_id << ", canceling activation for pull " - << next_request_it->first; - return false; - } - } - - // Activate the bundle. 
- for (const auto &ref : next_request_it->second) { - auto obj_id = ObjectRefToId(ref); - bool start_pull = active_object_pull_requests_.count(obj_id) == 0; - active_object_pull_requests_[obj_id].insert(next_request_it->first); - if (start_pull) { - RAY_LOG(DEBUG) << "Activating pull for object " << obj_id; - // This is the first bundle request in the queue to require this object. - // Add the size to the number of bytes being pulled. - auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - num_bytes_being_pulled_ += it->second.object_size; - objects_to_pull->push_back(obj_id); - } - } - - // Update the pointer to the last pull request that we are actively pulling. - RAY_CHECK(next_request_it->first > highest_req_id_being_pulled_); - highest_req_id_being_pulled_ = next_request_it->first; - return true; -} - -void PullManager::DeactivatePullBundleRequest( - const std::map>::iterator &request_it, - std::unordered_set *objects_to_cancel) { - for (const auto &ref : request_it->second) { - auto obj_id = ObjectRefToId(ref); - RAY_CHECK(active_object_pull_requests_[obj_id].erase(request_it->first)); - if (active_object_pull_requests_[obj_id].empty()) { - RAY_LOG(DEBUG) << "Deactivating pull for object " << obj_id; - auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - num_bytes_being_pulled_ -= it->second.object_size; - active_object_pull_requests_.erase(obj_id); - - if (objects_to_cancel) { - objects_to_cancel->insert(obj_id); - } - } - } - - // If this was the last active request, update the pointer to its - // predecessor, if one exists. 
- if (highest_req_id_being_pulled_ == request_it->first) { - if (request_it == pull_request_bundles_.begin()) { - highest_req_id_being_pulled_ = 0; - } else { - highest_req_id_being_pulled_ = std::prev(request_it)->first; - } - } -} - -void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) { - if (num_bytes_available_ != num_bytes_available) { - RAY_LOG(DEBUG) << "Updating pulls based on available memory: " << num_bytes_available; - } - num_bytes_available_ = num_bytes_available; - - // While there is available capacity, activate the next pull request. - std::vector objects_to_pull; - while (num_bytes_being_pulled_ < num_bytes_available_) { - // Get the next pull request in the queue. - const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); - auto next_request_it = last_request_it; - if (next_request_it == pull_request_bundles_.end()) { - // No requests are active. Get the first request in the queue. - next_request_it = pull_request_bundles_.begin(); - } else { - next_request_it++; - } - - if (next_request_it == pull_request_bundles_.end()) { - // No requests in the queue. - break; - } - - RAY_LOG(DEBUG) << "Activating request " << next_request_it->first - << " num bytes being pulled: " << num_bytes_being_pulled_ - << " num bytes available: " << num_bytes_available_; - // There is another pull bundle request that we could try, and there is - // enough space. Activate the next pull bundle request in the queue. - if (!ActivateNextPullBundleRequest(next_request_it, &objects_to_pull)) { - // This pull bundle request could not be activated, due to lack of object - // size information. Wait until we have object size information before - // activating this pull bundle. - break; - } - } - - std::unordered_set object_ids_to_cancel; - // While the total bytes requested is over the available capacity, deactivate - // the last pull request, ordered by request ID. 
- while (num_bytes_being_pulled_ > num_bytes_available_) { - RAY_LOG(DEBUG) << "Deactivating request " << highest_req_id_being_pulled_ - << " num bytes being pulled: " << num_bytes_being_pulled_ - << " num bytes available: " << num_bytes_available_; - const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); - RAY_CHECK(last_request_it != pull_request_bundles_.end()); - DeactivatePullBundleRequest(last_request_it, &object_ids_to_cancel); - } - - TriggerOutOfMemoryHandlingIfNeeded(); - - for (const auto &obj_id : objects_to_pull) { - if (object_ids_to_cancel.count(obj_id) == 0) { - TryToMakeObjectLocal(obj_id); - } - } -} - -void PullManager::TriggerOutOfMemoryHandlingIfNeeded() { - if (pull_request_bundles_.empty()) { - // No requests queued. - return; - } - - const auto head = pull_request_bundles_.begin(); - if (highest_req_id_being_pulled_ >= head->first) { - // At least one request is being actively pulled, so there is currently - // enough space. - return; - } - - // No requests are being pulled. Check whether this is because we don't have - // object size information yet. - size_t num_bytes_needed = 0; - for (const auto &ref : head->second) { - auto obj_id = ObjectRefToId(ref); - const auto it = object_pull_requests_.find(obj_id); - RAY_CHECK(it != object_pull_requests_.end()); - if (!it->second.object_size_set) { - // We're not pulling the first request because we don't have size - // information. Wait for the size information before triggering OOM - return; - } - num_bytes_needed += it->second.object_size; - } - - // The first request in the queue is not being pulled due to lack of space. - // Trigger out-of-memory handling to try to make room. - // TODO(swang): This can hang if no room can be made. We should return an - // error for requests whose total size is larger than the capacity of the - // memory store. 
- if (get_time_() - last_oom_reported_ms_ > 30000) { - RAY_LOG(WARNING) - << "There is not enough memory to pull objects needed by a queued task or " - "a worker blocked in ray.get or ray.wait. " - << "Need " << num_bytes_needed << " bytes, but only " << num_bytes_available_ - << " bytes are available on this node. " - << "This job may hang if no memory can be freed through garbage collection or " - "object spilling. See " - "https://docs.ray.io/en/master/memory-management.html for more information. " - "Please file a GitHub issue if you see this message repeatedly."; - last_oom_reported_ms_ = get_time_(); - } - object_store_full_callback_(); -} - std::vector PullManager::CancelPull(uint64_t request_id) { + std::vector objects_to_cancel; RAY_LOG(DEBUG) << "Cancel pull request " << request_id; auto bundle_it = pull_request_bundles_.find(request_id); RAY_CHECK(bundle_it != pull_request_bundles_.end()); - // If the pull request was being actively pulled, deactivate it now. - if (bundle_it->first <= highest_req_id_being_pulled_) { - DeactivatePullBundleRequest(bundle_it); - } - - // Erase this pull request. - std::vector object_ids_to_cancel; for (const auto &ref : bundle_it->second) { auto obj_id = ObjectRefToId(ref); auto it = object_pull_requests_.find(obj_id); RAY_CHECK(it != object_pull_requests_.end()); - RAY_CHECK(it->second.bundle_request_ids.erase(bundle_it->first)); + RAY_CHECK(it->second.bundle_request_ids.erase(request_id)); if (it->second.bundle_request_ids.empty()) { object_pull_requests_.erase(it); - object_ids_to_cancel.push_back(obj_id); + objects_to_cancel.push_back(obj_id); } } - pull_request_bundles_.erase(bundle_it); - - // We need to update the pulls in case there is another request(s) after this - // request that can now be activated. We do this after erasing the cancelled - // request to avoid reactivating it again. 
- UpdatePullsBasedOnAvailableMemory(num_bytes_available_); - return object_ids_to_cancel; + pull_request_bundles_.erase(bundle_it); + return objects_to_cancel; } void PullManager::OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size) { + const std::string &spilled_url) { // Exit if the Pull request has already been fulfilled or canceled. auto it = object_pull_requests_.find(object_id); if (it == object_pull_requests_.end()) { @@ -275,20 +77,6 @@ void PullManager::OnLocationChange(const ObjectID &object_id, // before. it->second.client_locations = std::vector(client_ids.begin(), client_ids.end()); it->second.spilled_url = spilled_url; - it->second.spilled_node_id = spilled_node_id; - if (!it->second.object_size_set) { - it->second.object_size = object_size; - it->second.object_size_set = true; - UpdatePullsBasedOnAvailableMemory(num_bytes_available_); - RAY_LOG(DEBUG) << "Updated size of object " << object_id << " to " << object_size - << ", num bytes being pulled is now " << num_bytes_being_pulled_; - if (it->second.object_size == 0) { - RAY_LOG(WARNING) << "Size of object " << object_id - << " stored in object store is zero. 
This may be a bug since " - "objects in the object store should be large, and can result " - "in too many objects being fetched to this node"; - } - } RAY_LOG(DEBUG) << "OnLocationChange " << spilled_url << " num clients " << client_ids.size(); @@ -299,57 +87,39 @@ void PullManager::TryToMakeObjectLocal(const ObjectID &object_id) { if (object_is_local_(object_id)) { return; } - if (active_object_pull_requests_.count(object_id) == 0) { + auto it = object_pull_requests_.find(object_id); + if (it == object_pull_requests_.end()) { return; } - auto it = object_pull_requests_.find(object_id); - RAY_CHECK(it != object_pull_requests_.end()); auto &request = it->second; if (request.next_pull_time > get_time_()) { return; } - // We always pull objects from a remote node before - // restoring it because of two reasons. - // 1. This will help reducing the load of external storages - // or remote node that spilled the object. - // 2. Also, if we use multi-node file spilling, the restoration will be - // confirmed by a object location subscription, so we should pull first - // before requesting for object restoration. - bool did_pull = PullFromRandomLocation(object_id); - if (did_pull) { - // New object locations were found, so begin trying to pull from a - // client. - UpdateRetryTimer(request); - return; - } - - // If we cannot pull, it means all objects have been evicted, so try restoring objects - // from the external storage. If the object was spilled on the current node, the - // callback will restore the object from the local the disk. - // Otherwise, it will send a request to a remote node that spilled the object. - // If external storage is a distributed storage, we always try restoring from it without - // sending RPCs. if (!request.spilled_url.empty()) { - const auto spilled_node_id = request.spilled_node_id; + // Try to restore the spilled object. 
restore_spilled_object_( - object_id, request.spilled_url, spilled_node_id, - [this, object_id, spilled_node_id](const ray::Status &status) { + object_id, request.spilled_url, [this, object_id](const ray::Status &status) { + bool did_pull = true; + // Fall back to fetching from another object manager. if (!status.ok()) { - const auto node_id_with_issue = - spilled_node_id.IsNil() ? self_node_id_ : spilled_node_id; - RAY_LOG(WARNING) - << "Object restoration failed and the object could " - "not be " - "found on any other nodes. This can happen if the location where the " - "object was spilled is unreachable. This job may hang if the object " - "is permanently unreachable. " - "Please check the log of node of id: " - << node_id_with_issue << " Object id: " << object_id; + did_pull = PullFromRandomLocation(object_id); + } + if (!did_pull) { + RAY_LOG(WARNING) << "Object restoration failed and the object could not be " + "found on any other nodes. Object id: " + << object_id; } }); - // We shouldn't update the timer here because restoration takes some time, and since - // we retry pull requests with exponential backoff, the delay could be large. + UpdateRetryTimer(request); + } else { + // New object locations were found, so begin trying to pull from a + // client. This will be called every time a new client location + // appears. + bool did_pull = PullFromRandomLocation(object_id); + if (did_pull) { + UpdateRetryTimer(request); + } } } @@ -404,14 +174,6 @@ bool PullManager::PullFromRandomLocation(const ObjectID &object_id) { return true; } -void PullManager::ResetRetryTimer(const ObjectID &object_id) { - auto it = object_pull_requests_.find(object_id); - if (it != object_pull_requests_.end()) { - it->second.next_pull_time = get_time_(); - it->second.num_retries = 0; - } -} - void PullManager::UpdateRetryTimer(ObjectPullRequest &request) { const auto time = get_time_(); auto retry_timeout_len = (pull_timeout_ms_ / 1000.) 
* (1UL << request.num_retries); @@ -422,7 +184,7 @@ void PullManager::UpdateRetryTimer(ObjectPullRequest &request) { } void PullManager::Tick() { - for (auto &pair : active_object_pull_requests_) { + for (auto &pair : object_pull_requests_) { const auto &object_id = pair.first; TryToMakeObjectLocal(object_id); } @@ -430,16 +192,4 @@ void PullManager::Tick() { int PullManager::NumActiveRequests() const { return object_pull_requests_.size(); } -std::string PullManager::DebugString() const { - std::stringstream result; - result << "PullManager:"; - result << "\n- num bytes available for pulled objects: " << num_bytes_available_; - result << "\n- num bytes being pulled: " << num_bytes_being_pulled_; - result << "\n- num pull request bundles: " << pull_request_bundles_.size(); - result << "\n- num objects requested pull: " << object_pull_requests_.size(); - result << "\n- num objects actively being pulled: " - << active_object_pull_requests_.size(); - return result.str(); -} - } // namespace ray diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index b0c80e338597..6364ae34a68d 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -40,14 +40,9 @@ class PullManager { NodeID &self_node_id, const std::function object_is_local, const std::function send_pull_request, const RestoreSpilledObjectCallback restore_spilled_object, - const std::function get_time, int pull_timeout_ms, - size_t num_bytes_available, std::function object_store_full_callback); - - /// Add a new pull request for a bundle of objects. The objects in the - /// request will get pulled once: - /// 1. Their sizes are known. - /// 2. Their total size, together with the total size of all requests - /// preceding this one, is within the capacity of the local object store. + const std::function get_time, int pull_timeout_ms); + + /// Begin a new pull request for a bundle of objects. 
/// /// \param object_refs The bundle of objects that must be made local. /// \param objects_to_locate The objects whose new locations the caller @@ -56,15 +51,6 @@ class PullManager { uint64_t Pull(const std::vector &object_ref_bundle, std::vector *objects_to_locate); - /// Update the pull requests that are currently being pulled, according to - /// the current capacity. The PullManager will choose the objects to pull by - /// taking the longest contiguous prefix of the request queue whose total - /// size is less than the given capacity. - /// - /// \param num_bytes_available The number of bytes that are currently - /// available to store objects pulled from another node. - void UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available); - /// Called when the available locations for a given object change. /// /// \param object_id The ID of the object which is now available in a new location. @@ -72,12 +58,9 @@ class PullManager { /// necessarily a super or subset of the previously available nodes. /// \param spilled_url The location of the object if it was spilled. If /// non-empty, the object may no longer be on any node. - /// \param spilled_node_id The node id of the object if it was spilled. If Nil, the - /// object may no longer be on any node. void OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, const NodeID &spilled_node_id, - size_t object_size); + const std::string &spilled_url); /// Cancel an existing pull request. /// @@ -90,18 +73,9 @@ class PullManager { /// existing objects from other nodes if necessary. void Tick(); - /// Call to reset the retry timer for an object that is actively being - /// pulled. This should be called for objects that were evicted but that may - /// still be needed on this node. - /// - /// \param object_id The object ID to reset. - void ResetRetryTimer(const ObjectID &object_id); - /// The number of ongoing object pulls. 
int NumActiveRequests() const; - std::string DebugString() const; - private: /// A helper structure for tracking information about each ongoing object pull. struct ObjectPullRequest { @@ -113,14 +87,8 @@ class PullManager { bundle_request_ids() {} std::vector client_locations; std::string spilled_url; - NodeID spilled_node_id; double next_pull_time; uint8_t num_retries; - bool object_size_set = false; - size_t object_size = 0; - // All bundle requests that haven't been canceled yet that require this - // object. This includes bundle requests whose objects are not actively - // being pulled. absl::flat_hash_set bundle_request_ids; }; @@ -144,24 +112,6 @@ class PullManager { /// \param request The request to update the retry time of. void UpdateRetryTimer(ObjectPullRequest &request); - /// Activate the next pull request in the queue. This will start pulls for - /// any objects in the request that are not already being pulled. - bool ActivateNextPullBundleRequest( - const std::map>::iterator - &next_request_it, - std::vector *objects_to_pull); - - /// Deactivate a pull request in the queue. This cancels any pull or restore - /// operations for the object. - void DeactivatePullBundleRequest( - const std::map>::iterator &request_it, - std::unordered_set *objects_to_cancel = nullptr); - - /// Trigger out-of-memory handling if the first request in the queue needs - /// more space than the bytes available. This is needed to make room for the - /// request. - void TriggerOutOfMemoryHandlingIfNeeded(); - /// See the constructor's arguments. NodeID self_node_id_; const std::function object_is_local_; @@ -174,51 +124,13 @@ class PullManager { /// cancel. Start at 1 because 0 means null. uint64_t next_req_id_ = 1; - /// The currently active pull requests. Each request is a bundle of objects - /// that must be made local. The key is the ID that was assigned to that - /// request, which can be used by the caller to cancel the request. 
- std::map> pull_request_bundles_; - - /// The total number of bytes that we are currently pulling. This is the - /// total size of the objects requested that we are actively pulling. To - /// avoid starvation, this is always less than the available capacity in the - /// local object store. - size_t num_bytes_being_pulled_ = 0; - - /// The total number of bytes that is available to store objects that we are - /// pulling. - size_t num_bytes_available_; - - /// Triggered when the first request in the queue can't be pulled due to - /// out-of-memory. This callback should try to make more bytes available. - std::function object_store_full_callback_; - - /// The last time OOM was reported. Track this so we don't spam warnings when - /// the object store is full. - uint64_t last_oom_reported_ms_ = 0; - - /// A pointer to the highest request ID whose objects we are currently - /// pulling. We always pull a contiguous prefix of the active pull requests. - /// This means that all requests with a lower ID are either already canceled - /// or their objects are also being pulled. - uint64_t highest_req_id_being_pulled_ = 0; - - /// The objects that this object manager has been asked to fetch from remote - /// object managers. - std::unordered_map object_pull_requests_; + std::unordered_map> pull_request_bundles_; - /// The objects that we are currently fetching. This is a subset of the - /// objects that we have been asked to fetch. The total size of these objects - /// is the number of bytes that we are currently pulling, and it must be less - /// than the bytes available. - absl::flat_hash_map> - active_object_pull_requests_; + /// The objects that this object manager is currently trying to fetch from + /// remote object managers. + std::unordered_map object_pull_requests_; /// Internally maintained random number generator. 
std::mt19937_64 gen_; - - friend class PullManagerTest; - friend class PullManagerTestWithCapacity; - friend class PullManagerWithAdmissionControlTest; }; } // namespace ray diff --git a/src/ray/object_manager/test/create_request_queue_test.cc b/src/ray/object_manager/test/create_request_queue_test.cc index 5b107c71ad27..ec75e0043e79 100644 --- a/src/ray/object_manager/test/create_request_queue_test.cc +++ b/src/ray/object_manager/test/create_request_queue_test.cc @@ -49,6 +49,7 @@ class CreateRequestQueueTest : public ::testing::Test { : oom_grace_period_s_(1), current_time_ns_(0), queue_( + /*evict_if_full=*/true, /*oom_grace_period_s=*/oom_grace_period_s_, /*spill_object_callback=*/[&]() { return false; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, @@ -68,7 +69,7 @@ class CreateRequestQueueTest : public ::testing::Test { }; TEST_F(CreateRequestQueueTest, TestSimple) { - auto request = [&](PlasmaObject *result) { + auto request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -104,8 +105,10 @@ TEST_F(CreateRequestQueueTest, TestSimple) { } TEST_F(CreateRequestQueueTest, TestOom) { - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; - auto blocked_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + return PlasmaError::OutOfMemory; + }; + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -138,14 +141,17 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) { int num_global_gc_ = 0; int64_t current_time_ns; CreateRequestQueue queue( + /*evict_if_full=*/true, /*oom_grace_period_s=*/100, // Spilling is failing. 
/*spill_object_callback=*/[&]() { return false; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, /*get_time=*/[&]() { return current_time_ns; }); - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; - auto blocked_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + return PlasmaError::OutOfMemory; + }; + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -168,19 +174,20 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) { TEST_F(CreateRequestQueueTest, TestTransientOom) { CreateRequestQueue queue( + /*evict_if_full=*/true, /*oom_grace_period_s=*/oom_grace_period_s_, /*spill_object_callback=*/[&]() { return true; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, /*get_time=*/[&]() { return current_time_ns_; }); auto return_status = PlasmaError::OutOfMemory; - auto oom_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { if (return_status == PlasmaError::OK) { result->data_size = 1234; } return return_status; }; - auto blocked_request = [&](PlasmaObject *result) { + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -213,19 +220,20 @@ TEST_F(CreateRequestQueueTest, TestTransientOom) { TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) { bool is_spilling_possible = true; CreateRequestQueue queue( + /*evict_if_full=*/true, /*oom_grace_period_s=*/oom_grace_period_s_, /*spill_object_callback=*/[&]() { return is_spilling_possible; }, /*on_global_gc=*/[&]() { num_global_gc_++; }, /*get_time=*/[&]() { return current_time_ns_; }); auto return_status = PlasmaError::OutOfMemory; - auto oom_request = [&](PlasmaObject *result) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { if (return_status == PlasmaError::OK) { result->data_size = 1234; } return 
return_status; }; - auto blocked_request = [&](PlasmaObject *result) { + auto blocked_request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -263,15 +271,38 @@ TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) { AssertNoLeaks(); } +TEST_F(CreateRequestQueueTest, TestEvictIfFull) { + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + RAY_CHECK(evict_if_full); + return PlasmaError::OutOfMemory; + }; + + auto client = std::make_shared(); + static_cast(queue_.AddRequest(ObjectID::Nil(), client, oom_request)); + ASSERT_TRUE(queue_.ProcessRequests().IsObjectStoreFull()); + ASSERT_TRUE(queue_.ProcessRequests().IsObjectStoreFull()); +} + TEST(CreateRequestQueueParameterTest, TestNoEvictIfFull) { int64_t current_time_ns = 0; CreateRequestQueue queue( + /*evict_if_full=*/false, /*oom_grace_period_s=*/1, /*spill_object_callback=*/[&]() { return false; }, /*on_global_gc=*/[&]() {}, /*get_time=*/[&]() { return current_time_ns; }); - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; + bool first_try = true; + + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + if (first_try) { + RAY_CHECK(!evict_if_full); + first_try = false; + } else { + RAY_CHECK(evict_if_full); + } + return PlasmaError::OutOfMemory; + }; auto client = std::make_shared(); static_cast(queue.AddRequest(ObjectID::Nil(), client, oom_request)); @@ -281,7 +312,7 @@ TEST(CreateRequestQueueParameterTest, TestNoEvictIfFull) { } TEST_F(CreateRequestQueueTest, TestClientDisconnected) { - auto request = [&](PlasmaObject *result) { + auto request = [&](bool evict_if_full, PlasmaObject *result) { result->data_size = 1234; return PlasmaError::OK; }; @@ -310,7 +341,7 @@ TEST_F(CreateRequestQueueTest, TestClientDisconnected) { } TEST_F(CreateRequestQueueTest, TestTryRequestImmediately) { - auto request = [&](PlasmaObject *result) { + auto request = [&](bool evict_if_full, PlasmaObject *result) 
{ result->data_size = 1234; return PlasmaError::OK; }; @@ -335,7 +366,9 @@ TEST_F(CreateRequestQueueTest, TestTryRequestImmediately) { // Queue is empty, but request would block. Check that we do not attempt to // retry the request. - auto oom_request = [&](PlasmaObject *result) { return PlasmaError::OutOfMemory; }; + auto oom_request = [&](bool evict_if_full, PlasmaObject *result) { + return PlasmaError::OutOfMemory; + }; result = queue_.TryRequestImmediately(ObjectID::Nil(), client, oom_request); ASSERT_EQ(result.first.data_size, 0); ASSERT_EQ(result.second, PlasmaError::OutOfMemory); diff --git a/src/ray/object_manager/test/object_manager_stress_test.cc b/src/ray/object_manager/test/object_manager_stress_test.cc new file mode 100644 index 000000000000..8896ba9968db --- /dev/null +++ b/src/ray/object_manager/test/object_manager_stress_test.cc @@ -0,0 +1,453 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/common_protocol.h" +#include "ray/common/status.h" +#include "ray/common/test_util.h" +#include "ray/gcs/gcs_client/service_based_gcs_client.h" +#include "ray/object_manager/object_manager.h" +#include "ray/util/filesystem.h" +#include "src/ray/protobuf/common.pb.h" + +extern "C" { +#include "hiredis/hiredis.h" +} + +namespace ray { + +using rpc::GcsNodeInfo; + +static inline bool flushall_redis(void) { + redisContext *context = redisConnect("127.0.0.1", 6379); + if (context == nullptr || context->err) { + return false; + } + freeReplyObject(redisCommand(context, "FLUSHALL")); + freeReplyObject(redisCommand(context, "SET NumRedisShards 1")); + freeReplyObject(redisCommand(context, "LPUSH RedisShards 127.0.0.1:6380")); + redisFree(context); + + redisContext *shard_context = redisConnect("127.0.0.1", 6380); + if (shard_context == nullptr || shard_context->err) { + return false; + } + freeReplyObject(redisCommand(shard_context, "FLUSHALL")); + redisFree(shard_context); + + return true; +} + +int64_t current_time_ms() { + std::chrono::milliseconds ms_since_epoch = + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()); + return ms_since_epoch.count(); +} + +class MockServer { + public: + MockServer(boost::asio::io_service &main_service, + const ObjectManagerConfig &object_manager_config, + std::shared_ptr gcs_client) + : node_id_(NodeID::FromRandom()), + config_(object_manager_config), + gcs_client_(gcs_client), + object_manager_(main_service, node_id_, object_manager_config, + std::make_shared(main_service, gcs_client_), + nullptr) { + RAY_CHECK_OK(RegisterGcs(main_service)); + } + + ~MockServer() { RAY_CHECK_OK(gcs_client_->Nodes().UnregisterSelf()); } + + private: + ray::Status RegisterGcs(boost::asio::io_service &io_service) { + auto object_manager_port = object_manager_.GetServerPort(); + GcsNodeInfo node_info; + 
node_info.set_node_id(node_id_.Binary()); + node_info.set_node_manager_address("127.0.0.1"); + node_info.set_node_manager_port(object_manager_port); + node_info.set_object_manager_port(object_manager_port); + + ray::Status status = gcs_client_->Nodes().RegisterSelf(node_info, nullptr); + std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + return status; + } + + friend class StressTestObjectManager; + + NodeID node_id_; + ObjectManagerConfig config_; + std::shared_ptr gcs_client_; + ObjectManager object_manager_; +}; + +class TestObjectManagerBase : public ::testing::Test { + public: + void SetUp() { + WaitForCondition(flushall_redis, 7000); + + // start store + socket_name_1 = TestSetupUtil::StartObjectStore(); + socket_name_2 = TestSetupUtil::StartObjectStore(); + + unsigned int pull_timeout_ms = 1000; + uint64_t object_chunk_size = static_cast(std::pow(10, 3)); + int push_timeout_ms = 10000; + + // start first server + gcs_server_socket_name_ = TestSetupUtil::StartGcsServer("127.0.0.1"); + gcs::GcsClientOptions client_options("127.0.0.1", 6379, /*password*/ "", + /*is_test_client=*/false); + gcs_client_1 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_1->Connect(main_service)); + ObjectManagerConfig om_config_1; + om_config_1.store_socket_name = socket_name_1; + om_config_1.pull_timeout_ms = pull_timeout_ms; + om_config_1.object_chunk_size = object_chunk_size; + om_config_1.push_timeout_ms = push_timeout_ms; + om_config_1.object_manager_port = 0; + om_config_1.rpc_service_threads_number = 3; + server1.reset(new MockServer(main_service, om_config_1, gcs_client_1)); + + // start second server + gcs_client_2 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_2->Connect(main_service)); + ObjectManagerConfig om_config_2; + om_config_2.store_socket_name = socket_name_2; + om_config_2.pull_timeout_ms = pull_timeout_ms; + om_config_2.object_chunk_size = object_chunk_size; + om_config_2.push_timeout_ms = push_timeout_ms; + 
om_config_2.object_manager_port = 0; + om_config_2.rpc_service_threads_number = 3; + server2.reset(new MockServer(main_service, om_config_2, gcs_client_2)); + + // connect to stores. + RAY_CHECK_OK(client1.Connect(socket_name_1)); + RAY_CHECK_OK(client2.Connect(socket_name_2)); + } + + void TearDown() { + Status client1_status = client1.Disconnect(); + Status client2_status = client2.Disconnect(); + ASSERT_TRUE(client1_status.ok() && client2_status.ok()); + + gcs_client_1->Disconnect(); + gcs_client_2->Disconnect(); + + this->server1.reset(); + this->server2.reset(); + + TestSetupUtil::StopObjectStore(socket_name_1); + TestSetupUtil::StopObjectStore(socket_name_2); + + if (!gcs_server_socket_name_.empty()) { + TestSetupUtil::StopGcsServer(gcs_server_socket_name_); + } + } + + ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size) { + ObjectID object_id = ObjectID::FromRandom(); + RAY_LOG(DEBUG) << "ObjectID Created: " << object_id; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + uint64_t retry_with_request_id = 0; + std::shared_ptr data; + RAY_CHECK_OK(client.Create(object_id, ray::rpc::Address(), data_size, metadata, + metadata_size, &retry_with_request_id, &data)); + RAY_CHECK(retry_with_request_id == 0); + RAY_CHECK_OK(client.Seal(object_id)); + return object_id; + } + + void object_added_handler_1(ObjectID object_id) { v1.push_back(object_id); }; + + void object_added_handler_2(ObjectID object_id) { v2.push_back(object_id); }; + + protected: + std::thread p; + boost::asio::io_service main_service; + std::shared_ptr gcs_client_1; + std::shared_ptr gcs_client_2; + std::unique_ptr server1; + std::unique_ptr server2; + + plasma::PlasmaClient client1; + plasma::PlasmaClient client2; + std::vector v1; + std::vector v2; + + std::string gcs_server_socket_name_; + std::string socket_name_1; + std::string socket_name_2; +}; + +class StressTestObjectManager : public TestObjectManagerBase { + public: + enum class 
TransferPattern { + PUSH_A_B, + PUSH_B_A, + BIDIRECTIONAL_PUSH, + PULL_A_B, + PULL_B_A, + BIDIRECTIONAL_PULL, + BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE, + }; + + int async_loop_index = -1; + size_t num_expected_objects; + + std::vector async_loop_patterns = { + TransferPattern::PUSH_A_B, + TransferPattern::PUSH_B_A, + TransferPattern::BIDIRECTIONAL_PUSH, + TransferPattern::PULL_A_B, + TransferPattern::PULL_B_A, + TransferPattern::BIDIRECTIONAL_PULL, + TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE}; + + int num_connected_clients = 0; + + NodeID node_id_1; + NodeID node_id_2; + + int64_t start_time; + + void WaitConnections() { + node_id_1 = gcs_client_1->Nodes().GetSelfId(); + node_id_2 = gcs_client_2->Nodes().GetSelfId(); + RAY_CHECK_OK(gcs_client_1->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients += 1; + } + if (num_connected_clients == 4) { + StartTests(); + } + }, + nullptr)); + RAY_CHECK_OK(gcs_client_2->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients += 1; + } + if (num_connected_clients == 4) { + StartTests(); + } + }, + nullptr)); + } + + void StartTests() { + TestConnections(); + AddTransferTestHandlers(); + TransferTestNext(); + } + + void AddTransferTestHandlers() { + ray::Status status = ray::Status::OK(); + status = server1->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + object_added_handler_1(ObjectID::FromBinary(object_info.object_id)); + if (v1.size() == num_expected_objects && v1.size() == v2.size()) { + TransferTestComplete(); + } + }); + RAY_CHECK_OK(status); + status = server2->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + 
object_added_handler_2(ObjectID::FromBinary(object_info.object_id)); + if (v2.size() == num_expected_objects && v1.size() == v2.size()) { + TransferTestComplete(); + } + }); + RAY_CHECK_OK(status); + } + + void TransferTestNext() { + async_loop_index += 1; + if ((size_t)async_loop_index < async_loop_patterns.size()) { + TransferPattern pattern = async_loop_patterns[async_loop_index]; + TransferTestExecute(100, 3 * std::pow(10, 3) - 1, pattern); + } else { + main_service.stop(); + } + } + + plasma::ObjectBuffer GetObject(plasma::PlasmaClient &client, ObjectID &object_id) { + plasma::ObjectBuffer object_buffer; + RAY_CHECK_OK(client.Get(&object_id, 1, 0, &object_buffer)); + return object_buffer; + } + + void CompareObjects(ObjectID &object_id_1, ObjectID &object_id_2) { + plasma::ObjectBuffer object_buffer_1 = GetObject(client1, object_id_1); + plasma::ObjectBuffer object_buffer_2 = GetObject(client2, object_id_2); + uint8_t *data_1 = const_cast(object_buffer_1.data->Data()); + uint8_t *data_2 = const_cast(object_buffer_2.data->Data()); + ASSERT_EQ(object_buffer_1.data->Size(), object_buffer_2.data->Size()); + ASSERT_EQ(object_buffer_1.metadata->Size(), object_buffer_2.metadata->Size()); + int64_t total_size = object_buffer_1.data->Size() + object_buffer_1.metadata->Size(); + RAY_LOG(DEBUG) << "total_size " << total_size; + for (int i = -1; ++i < total_size;) { + ASSERT_TRUE(data_1[i] == data_2[i]); + } + } + + void TransferTestComplete() { + int64_t elapsed = current_time_ms() - start_time; + RAY_LOG(INFO) << "TransferTestComplete: " + << static_cast(async_loop_patterns[async_loop_index]) << " " + << v1.size() << " " << elapsed; + ASSERT_TRUE(v1.size() == v2.size()); + for (size_t i = 0; i < v1.size(); ++i) { + ASSERT_TRUE(std::find(v1.begin(), v1.end(), v2[i]) != v1.end()); + } + + // Compare objects and their hashes. 
+ for (size_t i = 0; i < v1.size(); ++i) { + ObjectID object_id_2 = v2[i]; + ObjectID object_id_1 = + v1[std::distance(v1.begin(), std::find(v1.begin(), v1.end(), v2[i]))]; + CompareObjects(object_id_1, object_id_2); + } + + v1.clear(); + v2.clear(); + TransferTestNext(); + } + + void TransferTestExecute(int num_trials, int64_t data_size, + TransferPattern transfer_pattern) { + NodeID node_id_1 = gcs_client_1->Nodes().GetSelfId(); + NodeID node_id_2 = gcs_client_2->Nodes().GetSelfId(); + + if (transfer_pattern == TransferPattern::BIDIRECTIONAL_PULL || + transfer_pattern == TransferPattern::BIDIRECTIONAL_PUSH || + transfer_pattern == TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE) { + num_expected_objects = (size_t)2 * num_trials; + } else { + num_expected_objects = (size_t)num_trials; + } + + start_time = current_time_ms(); + + switch (transfer_pattern) { + case TransferPattern::PUSH_A_B: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + server1->object_manager_.Push(oid1, node_id_2); + } + } break; + case TransferPattern::PUSH_B_A: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid2 = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(oid2, node_id_1); + } + } break; + case TransferPattern::BIDIRECTIONAL_PUSH: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + server1->object_manager_.Push(oid1, node_id_2); + ObjectID oid2 = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(oid2, node_id_1); + } + } break; + case TransferPattern::PULL_A_B: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + static_cast( + server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); + } + } break; + case TransferPattern::PULL_B_A: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid2 = WriteDataToClient(client2, data_size); + static_cast( + 
server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); + } + } break; + case TransferPattern::BIDIRECTIONAL_PULL: { + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size); + static_cast( + server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); + ObjectID oid2 = WriteDataToClient(client2, data_size); + static_cast( + server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); + } + } break; + case TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE: { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(1, 50); + for (int i = -1; ++i < num_trials;) { + ObjectID oid1 = WriteDataToClient(client1, data_size + dis(gen)); + static_cast( + server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); + ObjectID oid2 = WriteDataToClient(client2, data_size + dis(gen)); + static_cast( + server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); + } + } break; + default: { + RAY_LOG(FATAL) << "No case for transfer_pattern " + << static_cast(transfer_pattern); + } break; + } + } + + void TestConnections() { + RAY_LOG(DEBUG) << "\n" + << "Server node ids:" + << "\n"; + NodeID node_id_1 = gcs_client_1->Nodes().GetSelfId(); + NodeID node_id_2 = gcs_client_2->Nodes().GetSelfId(); + RAY_LOG(DEBUG) << "Server 1: " << node_id_1 << "\n" + << "Server 2: " << node_id_2; + + RAY_LOG(DEBUG) << "\n" + << "All connected nodes:" + << "\n"; + auto data = gcs_client_1->Nodes().Get(node_id_1); + RAY_LOG(DEBUG) << "NodeID=" << NodeID::FromBinary(data->node_id()) << "\n" + << "NodeIp=" << data->node_manager_address() << "\n" + << "NodePort=" << data->node_manager_port(); + auto data2 = gcs_client_1->Nodes().Get(node_id_2); + RAY_LOG(DEBUG) << "NodeID=" << NodeID::FromBinary(data2->node_id()) << "\n" + << "NodeIp=" << data2->node_manager_address() << "\n" + << "NodePort=" << data2->node_manager_port(); + } +}; + +TEST_F(StressTestObjectManager, 
StartStressTestObjectManager) { + auto AsyncStartTests = main_service.wrap([this]() { WaitConnections(); }); + AsyncStartTests(); + main_service.run(); +} + +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ray::TEST_STORE_EXEC_PATH = std::string(argv[1]); + ray::TEST_GCS_SERVER_EXEC_PATH = std::string(argv[2]); + return RUN_ALL_TESTS(); +} diff --git a/src/ray/object_manager/test/object_manager_test.cc b/src/ray/object_manager/test/object_manager_test.cc new file mode 100644 index 000000000000..7afe2e42ef03 --- /dev/null +++ b/src/ray/object_manager/test/object_manager_test.cc @@ -0,0 +1,496 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/object_manager/object_manager.h" + +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/status.h" +#include "ray/common/test_util.h" +#include "ray/gcs/gcs_client/service_based_gcs_client.h" +#include "ray/util/filesystem.h" +#include "src/ray/protobuf/common.pb.h" + +extern "C" { +#include "hiredis/hiredis.h" +} + +namespace { +int64_t wait_timeout_ms; +} // namespace + +namespace ray { + +using rpc::GcsNodeInfo; + +static inline void flushall_redis(void) { + redisContext *context = redisConnect("127.0.0.1", 6379); + freeReplyObject(redisCommand(context, "FLUSHALL")); + freeReplyObject(redisCommand(context, "SET NumRedisShards 1")); + freeReplyObject(redisCommand(context, "LPUSH RedisShards 127.0.0.1:6380")); + redisFree(context); +} + +class MockServer { + public: + MockServer(boost::asio::io_service &main_service, + const ObjectManagerConfig &object_manager_config, + std::shared_ptr gcs_client) + : node_id_(NodeID::FromRandom()), + config_(object_manager_config), + gcs_client_(gcs_client), + object_manager_(main_service, node_id_, object_manager_config, + std::make_shared(main_service, gcs_client_), + nullptr) { + RAY_CHECK_OK(RegisterGcs(main_service)); + } + + ~MockServer() { RAY_CHECK_OK(gcs_client_->Nodes().UnregisterSelf()); } + + private: + ray::Status RegisterGcs(boost::asio::io_service &io_service) { + auto object_manager_port = object_manager_.GetServerPort(); + GcsNodeInfo node_info; + node_info.set_node_id(node_id_.Binary()); + node_info.set_node_manager_address("127.0.0.1"); + node_info.set_node_manager_port(object_manager_port); + node_info.set_object_manager_port(object_manager_port); + + ray::Status status = gcs_client_->Nodes().RegisterSelf(node_info, nullptr); + return status; + } + + friend class TestObjectManager; + + NodeID node_id_; + ObjectManagerConfig config_; + std::shared_ptr gcs_client_; + ObjectManager object_manager_; +}; + +class TestObjectManagerBase : public ::testing::Test { + public: + void 
SetUp() { + flushall_redis(); + + // start store + socket_name_1 = TestSetupUtil::StartObjectStore(); + socket_name_2 = TestSetupUtil::StartObjectStore(); + + unsigned int pull_timeout_ms = 1; + push_timeout_ms = 1500; + + // start first server + gcs_server_socket_name_ = TestSetupUtil::StartGcsServer("127.0.0.1"); + gcs::GcsClientOptions client_options("127.0.0.1", 6379, /*password*/ "", + /*is_test_client=*/true); + gcs_client_1 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_1->Connect(main_service)); + ObjectManagerConfig om_config_1; + om_config_1.store_socket_name = socket_name_1; + om_config_1.pull_timeout_ms = pull_timeout_ms; + om_config_1.object_chunk_size = object_chunk_size; + om_config_1.push_timeout_ms = push_timeout_ms; + om_config_1.object_manager_port = 0; + om_config_1.rpc_service_threads_number = 3; + server1.reset(new MockServer(main_service, om_config_1, gcs_client_1)); + + // start second server + gcs_client_2 = std::make_shared(client_options); + RAY_CHECK_OK(gcs_client_2->Connect(main_service)); + ObjectManagerConfig om_config_2; + om_config_2.store_socket_name = socket_name_2; + om_config_2.pull_timeout_ms = pull_timeout_ms; + om_config_2.object_chunk_size = object_chunk_size; + om_config_2.push_timeout_ms = push_timeout_ms; + om_config_2.object_manager_port = 0; + om_config_2.rpc_service_threads_number = 3; + server2.reset(new MockServer(main_service, om_config_2, gcs_client_2)); + + // connect to stores. 
+ RAY_CHECK_OK(client1.Connect(socket_name_1)); + RAY_CHECK_OK(client2.Connect(socket_name_2)); + } + + void TearDown() { + Status client1_status = client1.Disconnect(); + Status client2_status = client2.Disconnect(); + ASSERT_TRUE(client1_status.ok() && client2_status.ok()); + + gcs_client_1->Disconnect(); + gcs_client_2->Disconnect(); + + this->server1.reset(); + this->server2.reset(); + + TestSetupUtil::StopObjectStore(socket_name_1); + TestSetupUtil::StopObjectStore(socket_name_2); + + if (!gcs_server_socket_name_.empty()) { + TestSetupUtil::StopGcsServer(gcs_server_socket_name_); + } + } + + ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size) { + return WriteDataToClient(client, data_size, ObjectID::FromRandom()); + } + + ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size, + ObjectID object_id) { + RAY_LOG(DEBUG) << "ObjectID Created: " << object_id; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + uint64_t retry_with_request_id = 0; + std::shared_ptr data; + RAY_CHECK_OK(client.Create(object_id, ray::rpc::Address(), data_size, metadata, + metadata_size, &retry_with_request_id, &data)); + RAY_CHECK(retry_with_request_id == 0); + RAY_CHECK_OK(client.Seal(object_id)); + return object_id; + } + + void object_added_handler_1(ObjectID object_id) { v1.push_back(object_id); }; + + void object_added_handler_2(ObjectID object_id) { v2.push_back(object_id); }; + + protected: + std::thread p; + boost::asio::io_service main_service; + std::shared_ptr gcs_client_1; + std::shared_ptr gcs_client_2; + std::unique_ptr server1; + std::unique_ptr server2; + + plasma::PlasmaClient client1; + plasma::PlasmaClient client2; + std::vector v1; + std::vector v2; + + std::string gcs_server_socket_name_; + std::string socket_name_1; + std::string socket_name_2; + + unsigned int push_timeout_ms; + + uint64_t object_chunk_size = static_cast(std::pow(10, 3)); +}; + +class TestObjectManager : public 
TestObjectManagerBase { + public: + int current_wait_test = -1; + int num_connected_clients_1 = 0; + int num_connected_clients_2 = 0; + std::atomic ready_cnt; + NodeID node_id_1; + NodeID node_id_2; + + ObjectID created_object_id1; + ObjectID created_object_id2; + + std::unique_ptr timer; + + void WaitConnections() { + node_id_1 = gcs_client_1->Nodes().GetSelfId(); + node_id_2 = gcs_client_2->Nodes().GetSelfId(); + RAY_CHECK_OK(gcs_client_1->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients_1 += 1; + } + if (num_connected_clients_1 == 2) { + ready_cnt += 1; + if (ready_cnt == 2) { + StartTests(); + } + } + }, + nullptr)); + RAY_CHECK_OK(gcs_client_2->Nodes().AsyncSubscribeToNodeChange( + [this](const NodeID &node_id, const GcsNodeInfo &data) { + if (node_id == node_id_1 || node_id == node_id_2) { + num_connected_clients_2 += 1; + } + if (num_connected_clients_2 == 2) { + ready_cnt += 1; + if (ready_cnt == 2) { + StartTests(); + } + } + }, + nullptr)); + } + + void StartTests() { + TestConnections(); + TestNotifications(); + } + + void TestNotifications() { + ray::Status status = ray::Status::OK(); + status = server1->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + object_added_handler_1(ObjectID::FromBinary(object_info.object_id)); + NotificationTestCompleteIfSatisfied(); + }); + RAY_CHECK_OK(status); + status = server2->object_manager_.SubscribeObjAdded( + [this](const object_manager::protocol::ObjectInfoT &object_info) { + object_added_handler_2(ObjectID::FromBinary(object_info.object_id)); + NotificationTestCompleteIfSatisfied(); + }); + RAY_CHECK_OK(status); + + size_t data_size = 1000000; + + // dummy_id is not local. The push function will timeout. 
+ ObjectID dummy_id = ObjectID::FromRandom(); + server1->object_manager_.Push(dummy_id, gcs_client_2->Nodes().GetSelfId()); + + created_object_id1 = ObjectID::FromRandom(); + WriteDataToClient(client1, data_size, created_object_id1); + // Server1 holds Object1 so this Push call will success. + server1->object_manager_.Push(created_object_id1, gcs_client_2->Nodes().GetSelfId()); + + // This timer is used to guarantee that the Push function for dummy_id will timeout. + timer.reset(new boost::asio::deadline_timer(main_service)); + auto period = boost::posix_time::milliseconds(push_timeout_ms + 10); + timer->expires_from_now(period); + created_object_id2 = ObjectID::FromRandom(); + timer->async_wait([this, data_size](const boost::system::error_code &error) { + WriteDataToClient(client2, data_size, created_object_id2); + }); + } + + void NotificationTestCompleteIfSatisfied() { + size_t num_expected_objects1 = 1; + size_t num_expected_objects2 = 2; + if (v1.size() == num_expected_objects1 && v2.size() == num_expected_objects2) { + SubscribeObjectThenWait(); + } + } + + void SubscribeObjectThenWait() { + int data_size = 100; + // Test to ensure Wait works properly during an active subscription to the same + // object. 
+ ObjectID object_1 = WriteDataToClient(client2, data_size); + ObjectID object_2 = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(object_1, gcs_client_1->Nodes().GetSelfId()); + server2->object_manager_.Push(object_2, gcs_client_1->Nodes().GetSelfId()); + + UniqueID sub_id = ray::UniqueID::FromRandom(); + RAY_CHECK_OK(server1->object_manager_.object_directory_->SubscribeObjectLocations( + sub_id, object_1, rpc::Address(), + [this, sub_id, object_1, object_2](const ray::ObjectID &object_id, + const std::unordered_set &clients, + const std::string &spilled_url) { + if (!clients.empty()) { + TestWaitWhileSubscribed(sub_id, object_1, object_2); + } + })); + } + + void TestWaitWhileSubscribed(UniqueID sub_id, ObjectID object_1, ObjectID object_2) { + int required_objects = 1; + int timeout_ms = 1500; + + std::vector object_ids = {object_1, object_2}; + boost::posix_time::ptime start_time = boost::posix_time::second_clock::local_time(); + + UniqueID wait_id = UniqueID::FromRandom(); + + RAY_CHECK_OK(server1->object_manager_.AddWaitRequest( + wait_id, object_ids, std::unordered_map(), timeout_ms, + required_objects, + [this, sub_id, object_1, object_ids, start_time]( + const std::vector &found, + const std::vector &remaining) { + int64_t elapsed = (boost::posix_time::second_clock::local_time() - start_time) + .total_milliseconds(); + RAY_LOG(DEBUG) << "elapsed " << elapsed; + RAY_LOG(DEBUG) << "found " << found.size(); + RAY_LOG(DEBUG) << "remaining " << remaining.size(); + RAY_CHECK(found.size() == 1); + // There's nothing more to test. A check will fail if unexpected behavior is + // triggered. + RAY_CHECK_OK( + server1->object_manager_.object_directory_->UnsubscribeObjectLocations( + sub_id, object_1)); + NextWaitTest(); + })); + + // Skip lookups and rely on Subscribe only to test subscribe interaction. 
+ server1->object_manager_.SubscribeRemainingWaitObjects(wait_id); + } + + void NextWaitTest() { + int data_size = 600; + current_wait_test += 1; + switch (current_wait_test) { + case 0: { + // Ensure timeout_ms = 0 is handled correctly. + // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. + TestWait(data_size, 5, 3, /*timeout_ms=*/0, false, false); + } break; + case 1: { + // Ensure timeout_ms = 1500 is handled correctly. + // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. + TestWait(data_size, 5, 3, wait_timeout_ms, false, false); + } break; + case 2: { + // Generate objects locally to ensure local object code-path works properly. + // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. + TestWait(data_size, 5, 3, wait_timeout_ms, false, /*test_local=*/true); + } break; + case 3: { + // Wait on an object that's never registered with GCS to ensure timeout works + // properly. + TestWait(data_size, /*num_objects=*/5, /*required_objects=*/6, wait_timeout_ms, + /*include_nonexistent=*/true, false); + } break; + case 4: { + // Ensure infinite time code-path works properly. 
+ TestWait(data_size, 5, 5, /*timeout_ms=*/-1, false, false); + } break; + } + } + + void TestWait(int data_size, int num_objects, uint64_t required_objects, int timeout_ms, + bool include_nonexistent, bool test_local) { + std::vector object_ids; + for (int i = -1; ++i < num_objects;) { + ObjectID oid; + if (test_local) { + oid = WriteDataToClient(client1, data_size); + } else { + oid = WriteDataToClient(client2, data_size); + server2->object_manager_.Push(oid, gcs_client_1->Nodes().GetSelfId()); + } + object_ids.push_back(oid); + } + if (include_nonexistent) { + num_objects += 1; + object_ids.push_back(ObjectID::FromRandom()); + } + + boost::posix_time::ptime start_time = boost::posix_time::second_clock::local_time(); + RAY_CHECK_OK(server1->object_manager_.Wait( + object_ids, std::unordered_map(), timeout_ms, + required_objects, + [this, object_ids, num_objects, timeout_ms, required_objects, start_time]( + const std::vector &found, + const std::vector &remaining) { + int64_t elapsed = (boost::posix_time::second_clock::local_time() - start_time) + .total_milliseconds(); + RAY_LOG(DEBUG) << "elapsed " << elapsed; + RAY_LOG(DEBUG) << "found " << found.size(); + RAY_LOG(DEBUG) << "remaining " << remaining.size(); + + // Ensure object order is preserved for all invocations. + size_t j = 0; + size_t k = 0; + for (size_t i = 0; i < object_ids.size(); ++i) { + ObjectID oid = object_ids[i]; + // Make sure the object is in either the found vector or the remaining vector. + if (j < found.size() && found[j] == oid) { + j += 1; + } + if (k < remaining.size() && remaining[k] == oid) { + k += 1; + } + } + if (!found.empty()) { + ASSERT_EQ(j, found.size()); + } + if (!remaining.empty()) { + ASSERT_EQ(k, remaining.size()); + } + + switch (current_wait_test) { + case 0: { + // Ensure timeout_ms = 0 returns expected number of found and remaining + // objects. 
+ ASSERT_TRUE(found.size() <= required_objects); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 1: { + // Ensure lookup succeeds as expected when timeout_ms = 1500. + ASSERT_TRUE(found.size() >= required_objects); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 2: { + // Ensure lookup succeeds as expected when objects are local. + ASSERT_TRUE(found.size() >= required_objects); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 3: { + // Ensure lookup returns after timeout_ms elapses when one object doesn't + // exist. + ASSERT_TRUE(elapsed >= timeout_ms); + ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); + NextWaitTest(); + } break; + case 4: { + // Ensure timeout_ms = -1 works properly. + ASSERT_TRUE(static_cast(found.size()) == num_objects); + ASSERT_TRUE(remaining.size() == 0); + TestWaitComplete(); + } break; + } + })); + } + + void TestWaitComplete() { main_service.stop(); } + + void TestConnections() { + RAY_LOG(DEBUG) << "\n" + << "Server node ids:" + << "\n"; + auto data = gcs_client_1->Nodes().Get(node_id_1); + RAY_LOG(DEBUG) << (NodeID::FromBinary(data->node_id()).IsNil()); + RAY_LOG(DEBUG) << "Server 1 NodeID=" << NodeID::FromBinary(data->node_id()); + RAY_LOG(DEBUG) << "Server 1 NodeIp=" << data->node_manager_address(); + RAY_LOG(DEBUG) << "Server 1 NodePort=" << data->node_manager_port(); + ASSERT_EQ(node_id_1, NodeID::FromBinary(data->node_id())); + auto data2 = gcs_client_1->Nodes().Get(node_id_2); + RAY_LOG(DEBUG) << "Server 2 NodeID=" << NodeID::FromBinary(data2->node_id()); + RAY_LOG(DEBUG) << "Server 2 NodeIp=" << data2->node_manager_address(); + RAY_LOG(DEBUG) << "Server 2 NodePort=" << data2->node_manager_port(); + ASSERT_EQ(node_id_2, NodeID::FromBinary(data2->node_id())); + } +}; + +/* TODO(ekl) this seems to be hanging 
occasionally on Linux +TEST_F(TestObjectManager, StartTestObjectManager) { + // TODO: Break this test suite into unit tests. + auto AsyncStartTests = main_service.wrap([this]() { WaitConnections(); }); + AsyncStartTests(); + main_service.run(); +} +*/ + +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ray::TEST_STORE_EXEC_PATH = std::string(argv[1]); + wait_timeout_ms = std::stoi(std::string(argv[2])); + ray::TEST_GCS_SERVER_EXEC_PATH = std::string(argv[3]); + return RUN_ALL_TESTS(); +} diff --git a/src/ray/object_manager/test/pull_manager_test.cc b/src/ray/object_manager/test/pull_manager_test.cc index ecdaa06198fb..9230c87e9db9 100644 --- a/src/ray/object_manager/test/pull_manager_test.cc +++ b/src/ray/object_manager/test/pull_manager_test.cc @@ -10,70 +10,35 @@ namespace ray { using ::testing::ElementsAre; -class PullManagerTestWithCapacity { +class PullManagerTest : public ::testing::Test { public: - PullManagerTestWithCapacity(size_t num_available_bytes) + PullManagerTest() : self_node_id_(NodeID::FromRandom()), object_is_local_(false), num_send_pull_request_calls_(0), num_restore_spilled_object_calls_(0), - num_object_store_full_calls_(0), fake_time_(0), pull_manager_(self_node_id_, [this](const ObjectID &object_id) { return object_is_local_; }, [this](const ObjectID &object_id, const NodeID &node_id) { num_send_pull_request_calls_++; }, - [this](const ObjectID &, const std::string &, const NodeID &, + [this](const ObjectID &, const std::string &, std::function callback) { num_restore_spilled_object_calls_++; restore_object_callback_ = callback; }, - [this]() { return fake_time_; }, 10000, num_available_bytes, - [this]() { num_object_store_full_calls_++; }) {} - - void AssertNoLeaks() { - ASSERT_TRUE(pull_manager_.pull_request_bundles_.empty()); - ASSERT_TRUE(pull_manager_.object_pull_requests_.empty()); - ASSERT_TRUE(pull_manager_.active_object_pull_requests_.empty()); - // Most tests should not throw OOM. 
- ASSERT_EQ(num_object_store_full_calls_, 0); - } + [this]() { return fake_time_; }, 10000) {} NodeID self_node_id_; bool object_is_local_; int num_send_pull_request_calls_; int num_restore_spilled_object_calls_; - int num_object_store_full_calls_; std::function restore_object_callback_; double fake_time_; PullManager pull_manager_; }; -class PullManagerTest : public PullManagerTestWithCapacity, public ::testing::Test { - public: - PullManagerTest() : PullManagerTestWithCapacity(1) {} - - void AssertNumActiveRequestsEquals(size_t num_requests) { - ASSERT_EQ(pull_manager_.object_pull_requests_.size(), num_requests); - ASSERT_EQ(pull_manager_.active_object_pull_requests_.size(), num_requests); - } -}; - -class PullManagerWithAdmissionControlTest : public PullManagerTestWithCapacity, - public ::testing::Test { - public: - PullManagerWithAdmissionControlTest() : PullManagerTestWithCapacity(10) {} - - void AssertNumActiveRequestsEquals(size_t num_requests) { - ASSERT_EQ(pull_manager_.active_object_pull_requests_.size(), num_requests); - } - - bool IsUnderCapacity(size_t num_bytes_requested) { - return num_bytes_requested <= pull_manager_.num_bytes_available_; - } -}; - std::vector CreateObjectRefs(int num_objs) { std::vector refs; for (int i = 0; i < num_objs; i++) { @@ -88,14 +53,14 @@ std::vector CreateObjectRefs(int num_objs) { TEST_F(PullManagerTest, TestStaleSubscription) { auto refs = CreateObjectRefs(1); auto oid = ObjectRefsToIds(refs)[0]; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(oid, client_ids, "", NodeID::Nil(), 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(oid, client_ids, ""); // There are no client ids to pull 
from. ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -106,163 +71,119 @@ TEST_F(PullManagerTest, TestStaleSubscription) { ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(oid, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oid, client_ids, ""); // Now we're getting a notification about an object that was already cancelled. ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestRestoreSpilledObject) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); - NodeID node_that_object_spilled = NodeID::FromRandom(); + client_ids.insert(NodeID::FromRandom()); fake_time_ += 10.; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - node_that_object_spilled, 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // The behavior is supposed to be to always restore the spilled object if possible (even // if it exists elsewhere in the cluster). 
ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 1); - - // The restore object call will ask the remote node to restore the object, and the - // client location is updated accordingly. - client_ids.insert(node_that_object_spilled); - fake_time_ += 10.; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - node_that_object_spilled, 0); - - // Now the pull requests are sent. - ASSERT_EQ(num_send_pull_request_calls_, 1); - ASSERT_EQ(num_restore_spilled_object_calls_, 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 2); // Don't restore an object if it's local. object_is_local_ = true; num_restore_spilled_object_calls_ = 0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - NodeID::FromRandom(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); ASSERT_EQ(num_restore_spilled_object_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestRestoreObjectFailed) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; - auto req_id = pull_manager_.Pull(refs, &objects_to_locate); + pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); + std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); // client_ids is empty here, so there's nowhere to pull from. 
ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); - // Object is now spilled to a remote node, but the client_ids are still empty. - const NodeID remote_node_object_spilled = NodeID::FromRandom(); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_object_spilled, 0); + restore_object_callback_(ray::Status::IOError(":(")); + // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 1); - restore_object_callback_(ray::Status::IOError(":(")); + client_ids.insert(NodeID::FromRandom()); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + + // We always assume the restore succeeded so there's only 1 restore call still. + ASSERT_EQ(num_send_pull_request_calls_, 0); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); - // Now the restore request has failed, the remote object shouldn't have been properly - // restored. fake_time_ += 10.0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_object_spilled, 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - restore_object_callback_(ray::Status::OK()); - // Now the remote restoration request succeeds, so we sholud be able to pull the object. - client_ids.insert(remote_node_object_spilled); - // Since it is the second retry, the interval gets doubled. - fake_time_ += 20.0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_object_spilled, 0); + restore_object_callback_(ray::Status::IOError(":(")); - // Now that we've successfully sent a pull request, we need to wait for the retry period - // before sending another one. 
+ // Since restore failed, we can fallback to pulling from another node immediately. ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - auto objects_to_cancel = pull_manager_.CancelPull(req_id); - AssertNoLeaks(); -} - -TEST_F(PullManagerTest, TestLoadBalancingRestorationRequest) { - /* Make sure when the object copy is in other raylet, we pull object from there instead - * of requesting the owner node to restore the object. */ - - auto refs = CreateObjectRefs(1); - auto obj1 = ObjectRefsToIds(refs)[0]; - rpc::Address addr1; - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); - std::vector objects_to_locate; - pull_manager_.Pull(refs, &objects_to_locate); - ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); - - std::unordered_set client_ids; - const auto copy_node1 = NodeID::FromRandom(); - const auto copy_node2 = NodeID::FromRandom(); - const auto remote_node_that_spilled_object = NodeID::FromRandom(); - client_ids.insert(copy_node1); - client_ids.insert(copy_node2); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", - remote_node_that_spilled_object, 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + // Now that we've successfully sent a pull request, we need to wait for the retry period + // before sending another one. ASSERT_EQ(num_send_pull_request_calls_, 1); - // Make sure the restore request wasn't sent since there are nodes that have a copied - // object. 
- ASSERT_EQ(num_restore_spilled_object_calls_, 0); + ASSERT_EQ(num_restore_spilled_object_calls_, 2); } TEST_F(PullManagerTest, TestManyUpdates) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (int i = 0; i < 100; i++) { - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, ""); } // Since no time has passed, only send a single pull request. @@ -271,26 +192,25 @@ TEST_F(PullManagerTest, TestManyUpdates) { auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestRetryTimer) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); // We need to call OnLocationChange at least once, to population the list of nodes with // the object. 
- pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, ""); ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -300,7 +220,7 @@ TEST_F(PullManagerTest, TestRetryTimer) { // Location changes can trigger reset timer. for (; fake_time_ <= 120 * 10; fake_time_ += 1.) { - pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(obj1, client_ids, ""); } // We should make a pull request every tick (even if it's a duplicate to a node we're @@ -318,59 +238,55 @@ TEST_F(PullManagerTest, TestRetryTimer) { auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - - AssertNoLeaks(); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); } TEST_F(PullManagerTest, TestBasic) { auto refs = CreateObjectRefs(3); auto oids = ObjectRefsToIds(refs); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); + ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); + ASSERT_EQ(num_send_pull_request_calls_, i + 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); } - ASSERT_EQ(num_send_pull_request_calls_, oids.size()); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(oids.size()); // Don't pull an object if it's local. 
object_is_local_ = true; num_send_pull_request_calls_ = 0; - fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); } ASSERT_EQ(num_send_pull_request_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, oids); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); // Don't pull a remote object if we've canceled. object_is_local_ = false; num_send_pull_request_calls_ = 0; - fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); } ASSERT_EQ(num_send_pull_request_calls_, 0); - - AssertNoLeaks(); } TEST_F(PullManagerTest, TestDeduplicateBundles) { auto refs = CreateObjectRefs(3); auto oids = ObjectRefsToIds(refs); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); std::vector objects_to_locate; auto req_id1 = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); + ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); objects_to_locate.clear(); auto req_id2 = pull_manager_.Pull(refs, &objects_to_locate); @@ -379,22 +295,20 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); + ASSERT_EQ(num_send_pull_request_calls_, i + 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); } - ASSERT_EQ(num_send_pull_request_calls_, oids.size()); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(oids.size()); // Cancel one request. 
auto objects_to_cancel = pull_manager_.CancelPull(req_id1); ASSERT_TRUE(objects_to_cancel.empty()); // Objects should still be pulled because the other request is still open. - AssertNumActiveRequestsEquals(oids.size()); + ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); fake_time_ += 10; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); ASSERT_EQ(num_send_pull_request_calls_, i + 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); } @@ -402,191 +316,15 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { // Cancel the other request. objects_to_cancel = pull_manager_.CancelPull(req_id2); ASSERT_EQ(objects_to_cancel, oids); - AssertNumActiveRequestsEquals(0); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); // Don't pull a remote object if we've canceled. object_is_local_ = false; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, ""); } ASSERT_EQ(num_send_pull_request_calls_, 0); - - AssertNoLeaks(); -} - -TEST_F(PullManagerWithAdmissionControlTest, TestBasic) { - /// Test admission control for a single pull bundle request. We should - /// activate the request when we are under the reported capacity and - /// deactivate it when we are over. 
- auto refs = CreateObjectRefs(3); - auto oids = ObjectRefsToIds(refs); - size_t object_size = 2; - AssertNumActiveRequestsEquals(0); - std::vector objects_to_locate; - auto req_id = pull_manager_.Pull(refs, &objects_to_locate); - ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); - - std::unordered_set client_ids; - client_ids.insert(NodeID::FromRandom()); - for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); - } - ASSERT_EQ(num_send_pull_request_calls_, oids.size()); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - AssertNumActiveRequestsEquals(oids.size()); - ASSERT_TRUE(IsUnderCapacity(oids.size() * object_size)); - - // Reduce the available memory. - ASSERT_EQ(num_object_store_full_calls_, 0); - pull_manager_.UpdatePullsBasedOnAvailableMemory(oids.size() * object_size - 1); - AssertNumActiveRequestsEquals(0); - ASSERT_EQ(num_object_store_full_calls_, 1); - // No new pull requests after the next tick. - fake_time_ += 10; - auto prev_pull_requests = num_send_pull_request_calls_; - for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); - ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); - } - - // Increase the available memory again. - pull_manager_.UpdatePullsBasedOnAvailableMemory(oids.size() * object_size); - AssertNumActiveRequestsEquals(oids.size()); - ASSERT_TRUE(IsUnderCapacity(oids.size() * object_size)); - ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests + oids.size()); - - // OOM was not triggered a second time. - ASSERT_EQ(num_object_store_full_calls_, 1); - num_object_store_full_calls_ = 0; - - pull_manager_.CancelPull(req_id); - AssertNoLeaks(); -} - -TEST_F(PullManagerWithAdmissionControlTest, TestQueue) { - /// Test admission control for a queue of pull bundle requests. 
We should - /// activate as many requests as we can, subject to the reported capacity. - int object_size = 2; - int num_oids_per_request = 2; - int num_requests = 3; - - std::vector> bundles; - std::vector req_ids; - for (int i = 0; i < num_requests; i++) { - auto refs = CreateObjectRefs(num_oids_per_request); - auto oids = ObjectRefsToIds(refs); - std::vector objects_to_locate; - auto req_id = pull_manager_.Pull(refs, &objects_to_locate); - ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); - - bundles.push_back(oids); - req_ids.push_back(req_id); - } - - std::unordered_set client_ids; - client_ids.insert(NodeID::FromRandom()); - for (auto &oids : bundles) { - for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); - } - } - - for (int capacity = 0; capacity < 20; capacity++) { - int num_requests_expected = - std::min(num_requests, capacity / (object_size * num_oids_per_request)); - pull_manager_.UpdatePullsBasedOnAvailableMemory(capacity); - - AssertNumActiveRequestsEquals(num_requests_expected * num_oids_per_request); - // The total requests that are active is under the specified capacity. - ASSERT_TRUE( - IsUnderCapacity(num_requests_expected * num_oids_per_request * object_size)); - // This is the maximum number of requests that can be served at once that - // is under the capacity. - if (num_requests_expected < num_requests) { - ASSERT_FALSE(IsUnderCapacity((num_requests_expected + 1) * num_oids_per_request * - object_size)); - } - // Check that OOM was triggered. - if (num_requests_expected == 0) { - ASSERT_EQ(num_object_store_full_calls_, 1); - } else { - ASSERT_EQ(num_object_store_full_calls_, 0); - } - num_object_store_full_calls_ = 0; - } - - for (auto req_id : req_ids) { - pull_manager_.CancelPull(req_id); - } - AssertNoLeaks(); -} - -TEST_F(PullManagerWithAdmissionControlTest, TestCancel) { - /// Test admission control while requests are cancelled out-of-order. 
When an - /// active request is cancelled, we should activate another request in the - /// queue, if there is one that satisfies the reported capacity. - auto test_cancel = [&](std::vector object_sizes, int capacity, size_t cancel_idx, - int num_active_requests_expected_before, - int num_active_requests_expected_after) { - pull_manager_.UpdatePullsBasedOnAvailableMemory(capacity); - auto refs = CreateObjectRefs(object_sizes.size()); - auto oids = ObjectRefsToIds(refs); - std::vector req_ids; - for (auto &ref : refs) { - std::vector objects_to_locate; - auto req_id = pull_manager_.Pull({ref}, &objects_to_locate); - req_ids.push_back(req_id); - } - for (size_t i = 0; i < object_sizes.size(); i++) { - pull_manager_.OnLocationChange(oids[i], {}, "", NodeID::Nil(), object_sizes[i]); - } - AssertNumActiveRequestsEquals(num_active_requests_expected_before); - pull_manager_.CancelPull(req_ids[cancel_idx]); - AssertNumActiveRequestsEquals(num_active_requests_expected_after); - - // Request is really canceled. - pull_manager_.OnLocationChange(oids[cancel_idx], {NodeID::FromRandom()}, "", - NodeID::Nil(), object_sizes[cancel_idx]); - ASSERT_EQ(num_send_pull_request_calls_, 0); - - // The expected number of requests at the head of the queue are pulled. - int num_active = 0; - for (size_t i = 0; i < refs.size() && num_active < num_active_requests_expected_after; - i++) { - pull_manager_.OnLocationChange(oids[i], {NodeID::FromRandom()}, "", NodeID::Nil(), - object_sizes[i]); - if (i != cancel_idx) { - num_active++; - } - } - ASSERT_EQ(num_send_pull_request_calls_, num_active_requests_expected_after); - - // Reset state. - for (size_t i = 0; i < req_ids.size(); i++) { - if (i != cancel_idx) { - pull_manager_.CancelPull(req_ids[i]); - } - } - num_send_pull_request_calls_ = 0; - }; - - // The next request in the queue is infeasible. If it is canceled, the - // request after that is activated. 
- test_cancel({1, 1, 2, 1}, 3, 2, 2, 3); - - // If an activated request is canceled, the next request is activated. - test_cancel({1, 1, 2, 1}, 3, 0, 2, 2); - test_cancel({1, 1, 2, 1}, 3, 1, 2, 2); - - // Cancellation of requests at the end of the queue has no effect. - test_cancel({1, 1, 2, 1, 1}, 3, 3, 2, 2); - - // As many new requests as possible are activated when one is canceled. - test_cancel({1, 2, 1, 1, 1}, 3, 1, 2, 3); - - AssertNoLeaks(); } } // namespace ray diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 7178fe7159d8..cc3149e84f46 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -46,6 +46,19 @@ enum TaskType { DRIVER_TASK = 3; } +// Type of placement group strategy. +enum PlacementStrategy { + // Packs Bundles into as few nodes as possible. + PACK = 0; + // Places Bundles across distinct nodes or processes as even as possible. + SPREAD = 1; + // Packs Bundles within one node. The group is not allowed to span multiple nodes. + STRICT_PACK = 2; + // Places Bundles across distinct nodes. + // The group is not allowed to deploy more than one bundle on a node. + STRICT_SPREAD = 3; +} + // Address of a worker or node manager. message Address { bytes raylet_id = 1; @@ -220,8 +233,6 @@ message PlacementGroupSpec { bool creator_job_dead = 7; // Whether or not if the creator actor is dead. bool creator_actor_dead = 8; - // Whether the placement group is persistent. - bool is_detached = 9; } message ObjectReference { @@ -443,24 +454,3 @@ enum WorkerExitType { // Worker exit due to placement group removal. PLACEMENT_GROUP_REMOVED = 3; } -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following enum to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. 
For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -// Type of placement group strategy. -enum PlacementStrategy { - // Packs Bundles into as few nodes as possible. - PACK = 0; - // Places Bundles across distinct nodes or processes as even as possible. - SPREAD = 1; - // Packs Bundles within one node. The group is not allowed to span multiple nodes. - STRICT_PACK = 2; - // Places Bundles across distinct nodes. - // The group is not allowed to deploy more than one bundle on a node. - STRICT_SPREAD = 3; -} -/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 66d5eb570782..799530d274e9 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -182,25 +182,10 @@ message RemoveObjectLocationOwnerReply { message GetObjectLocationsOwnerRequest { bytes intended_worker_id = 1; bytes object_id = 2; - // The version of the last location update. Only updates more recent than this version - // will be returned. -1 indicates that the current location data should - // always be returned. - int64 last_version = 3; } message GetObjectLocationsOwnerReply { - // The IDs of the nodes that this object appeared on or was evicted by. repeated bytes node_ids = 1; - // The size of the object in bytes. - uint64 object_size = 2; - // The object has been spilled to this URL. This should be set xor the above - // fields are set. - string spilled_url = 3; - // The ID of the node that spilled the object. - // This will be Nil if the object was spilled to distributed external storage. - bytes spilled_node_id = 4; - // The version of the returned location updates. 
- int64 current_version = 5; } message KillActorRequest { @@ -314,9 +299,6 @@ message PlasmaObjectReadyReply { message SpillObjectsRequest { // The IDs of objects to be spilled. repeated bytes object_ids_to_spill = 1; - // The owner addresses of the objects to be spilled. Must be in the same order as - // object_ids_to_spill. - repeated Address owner_addresses = 2; } message SpillObjectsReply { @@ -344,22 +326,6 @@ message DeleteSpilledObjectsRequest { message DeleteSpilledObjectsReply { } -message AddSpilledUrlRequest { - // Object that was spilled. - bytes object_id = 1; - // For objects that have been spilled to external storage, the URL from which - // they can be retrieved. - string spilled_url = 2; - // The ID of the node that spilled the object. - // This will be Nil if the object was spilled to distributed external storage. - bytes spilled_node_id = 3; - // The size of the object in bytes. - int64 size = 4; -} - -message AddSpilledUrlReply { -} - message ExitRequest { } @@ -412,9 +378,6 @@ service CoreWorkerService { // Delete spilled objects from external storage. Caller: raylet; callee: I/O worker. rpc DeleteSpilledObjects(DeleteSpilledObjectsRequest) returns (DeleteSpilledObjectsReply); - // Add spilled URL, spilled node ID, and update object size for owned object. - // Caller: raylet; callee: owner worker. - rpc AddSpilledUrl(AddSpilledUrlRequest) returns (AddSpilledUrlReply); // Notification from raylet that an object ID is available in local plasma. rpc PlasmaObjectReady(PlasmaObjectReadyRequest) returns (PlasmaObjectReadyReply); // Request for a worker to exit. diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 5da9842f9619..d0793c35ca13 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -158,6 +158,41 @@ message ErrorTableData { double timestamp = 4; } +message PlacementGroupTableData { + // State of a placement group. 
+ enum PlacementGroupState { + // Placement Group is pending or scheduling + PENDING = 0; + // Placement Group is created. + CREATED = 1; + // Placement Group is already removed and won't be reschedule. + REMOVED = 2; + // Placement Group is rescheduling because the node it placed is dead. + RESCHEDULING = 3; + } + + // ID of the PlacementGroup. + bytes placement_group_id = 1; + // The name of the placement group. + string name = 2; + // The array of the bundle in Placement Group. + repeated Bundle bundles = 3; + // The schedule strategy of this Placement Group. + PlacementStrategy strategy = 4; + // Current state of this placement group. + PlacementGroupState state = 5; + // Fields to detect the owner of the placement group + // for automatic lifecycle management. + // The job id that created this placement group. + bytes creator_job_id = 6; + // The actor id that created this placement group. + bytes creator_actor_id = 7; + // Whether or not if the creator job is dead. + bool creator_job_dead = 8; + // Whether or not if the creator actor is dead. + bool creator_actor_dead = 9; +} + message ScheduleData { map schedule_plan = 1; } @@ -238,11 +273,69 @@ message GcsNodeInfo { int64 timestamp = 10; } +// Represents the demand for a particular resource shape. +message ResourceDemand { + // The resource shape requested. This is a map from the resource string + // (e.g., "CPU") to the amount requested. + map shape = 1; + // The number of requests that are ready to run (i.e., dependencies have been + // fulfilled), but that are waiting for resources. + uint64 num_ready_requests_queued = 2; + // The number of requests for which there is no node that is a superset of + // the requested resource shape. + uint64 num_infeasible_requests_queued = 3; + // The number of requests of this shape still queued in CoreWorkers that this + // raylet knows about. + int64 backlog_size = 4; +} + +// Represents the demand sorted by resource shape. 
+message ResourceLoad { + // A list of all resource demands. The resource shape in each demand is + // unique. + repeated ResourceDemand resource_demands = 1; +} + +message PlacementGroupLoad { + // The list of pending placement group specifications. + repeated PlacementGroupTableData placement_group_data = 1; +} + message HeartbeatTableData { // Node id. bytes node_id = 1; } +message ResourcesData { + // Node id. + bytes node_id = 1; + // Resource capacity currently available on this node manager. + map resources_available = 2; + // Indicates whether available resources is changed. Only used when light + // heartbeat enabled. + bool resources_available_changed = 3; + // Total resource capacity configured for this node manager. + map resources_total = 4; + // Aggregate outstanding resource load on this node manager. + map resource_load = 5; + // Indicates whether resource load is changed. Only used when + // light heartbeat enabled. + bool resource_load_changed = 6; + // The resource load on this node, sorted by resource shape. + ResourceLoad resource_load_by_shape = 7; + // Whether this node manager is requesting global GC. + bool should_global_gc = 8; +} + +message ResourceUsageBatchData { + repeated ResourcesData batch = 1; + // The total resource demand on all nodes included in the batch, sorted by + // resource shape. + ResourceLoad resource_load_by_shape = 2; + // The pending list of placement groups. + PlacementGroupLoad placement_group_load = 3; +} + // Data for a lease on task execution. message TaskLeaseData { // The task ID. @@ -320,11 +413,6 @@ message ObjectLocationInfo { // For objects that have been spilled to external storage, the URL from which // they can be retrieved. string spilled_url = 3; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. - bytes spilled_node_id = 4; - // The size of the object in bytes. 
- uint64 size = 5; } // A notification message about one object's locations being changed. @@ -335,11 +423,6 @@ message ObjectLocationChange { // The object has been spilled to this URL. This should be set xor the above // fields are set. string spilled_url = 3; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. - bytes spilled_node_id = 4; - // The size of the object in bytes. - uint64 size = 5; } // A notification message about one node's resources being changed. @@ -356,109 +439,3 @@ message PubSubMessage { bytes id = 1; bytes data = 2; } - -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following messages to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -// Represents the demand for a particular resource shape. -message ResourceDemand { - // The resource shape requested. This is a map from the resource string - // (e.g., "CPU") to the amount requested. - map shape = 1; - // The number of requests that are ready to run (i.e., dependencies have been - // fulfilled), but that are waiting for resources. - uint64 num_ready_requests_queued = 2; - // The number of requests for which there is no node that is a superset of - // the requested resource shape. - uint64 num_infeasible_requests_queued = 3; - // The number of requests of this shape still queued in CoreWorkers that this - // raylet knows about. - int64 backlog_size = 4; -} - -// Represents the demand sorted by resource shape. -message ResourceLoad { - // A list of all resource demands. The resource shape in each demand is - // unique. 
- repeated ResourceDemand resource_demands = 1; -} - -message ResourcesData { - // Node id. - bytes node_id = 1; - // Resource capacity currently available on this node manager. - map resources_available = 2; - // Indicates whether available resources is changed. Only used when light - // heartbeat enabled. - bool resources_available_changed = 3; - // Total resource capacity configured for this node manager. - map resources_total = 4; - // Aggregate outstanding resource load on this node manager. - map resource_load = 5; - // Indicates whether resource load is changed. Only used when - // light heartbeat enabled. - bool resource_load_changed = 6; - // The resource load on this node, sorted by resource shape. - ResourceLoad resource_load_by_shape = 7; - // Whether this node manager is requesting global GC. - bool should_global_gc = 8; - // IP address of the node. - string node_manager_address = 9; -} - -message ResourceUsageBatchData { - repeated ResourcesData batch = 1; - // The total resource demand on all nodes included in the batch, sorted by - // resource shape. - ResourceLoad resource_load_by_shape = 2; - // The pending list of placement groups. - PlacementGroupLoad placement_group_load = 3; -} - -message PlacementGroupLoad { - // The list of pending placement group specifications. - repeated PlacementGroupTableData placement_group_data = 1; -} - -message PlacementGroupTableData { - // State of a placement group. - enum PlacementGroupState { - // Placement Group is pending or scheduling - PENDING = 0; - // Placement Group is created. - CREATED = 1; - // Placement Group is already removed and won't be reschedule. - REMOVED = 2; - // Placement Group is rescheduling because the node it placed is dead. - RESCHEDULING = 3; - } - - // ID of the PlacementGroup. - bytes placement_group_id = 1; - // The name of the placement group. - string name = 2; - // The array of the bundle in Placement Group. 
- repeated Bundle bundles = 3; - // The schedule strategy of this Placement Group. - PlacementStrategy strategy = 4; - // Current state of this placement group. - PlacementGroupState state = 5; - // Fields to detect the owner of the placement group - // for automatic lifecycle management. - // The job id that created this placement group. - bytes creator_job_id = 6; - // The actor id that created this placement group. - bytes creator_actor_id = 7; - // Whether or not if the creator job is dead. - bool creator_job_dead = 8; - // Whether or not if the creator actor is dead. - bool creator_actor_dead = 9; - // Whether the placement group is persistent. - bool is_detached = 10; -} -/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 41c71c7e05ca..35c86b3bedbe 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -19,6 +19,11 @@ package ray.rpc; import "src/ray/protobuf/common.proto"; import "src/ray/protobuf/gcs.proto"; +message GcsStatus { + int32 code = 1; + string message = 2; +} + message AddJobRequest { JobTableData data = 1; } @@ -87,22 +92,6 @@ message GetAllActorInfoReply { repeated ActorTableData actor_table_data = 2; } -// `KillActorViaGcsRequest` is sent to GCS Service to ask to kill an actor. -// `KillActorViaGcsRequest` is different from `KillActorRequest`. -// `KillActorRequest` is send to core worker to ask to kill an actor. -message KillActorViaGcsRequest { - // ID of this actor. - bytes actor_id = 1; - // Whether to force kill the actor. - bool force_kill = 2; - // If set to true, the killed actor will not be restarted anymore. - bool no_restart = 3; -} - -message KillActorViaGcsReply { - GcsStatus status = 1; -} - // Service for actor info access. service ActorInfoGcsService { // Register actor to gcs service. 
@@ -115,8 +104,6 @@ service ActorInfoGcsService { rpc GetNamedActorInfo(GetNamedActorInfoRequest) returns (GetNamedActorInfoReply); // Get information of all actor from GCS Service. rpc GetAllActorInfo(GetAllActorInfoRequest) returns (GetAllActorInfoReply); - // Kill actor via GCS Service. - rpc KillActorViaGcs(KillActorViaGcsRequest) returns (KillActorViaGcsReply); } message RegisterNodeRequest { @@ -226,6 +213,31 @@ message ReportResourceUsageReply { GcsStatus status = 1; } +message GetAllResourceUsageRequest { +} + +message GetAllResourceUsageReply { + GcsStatus status = 1; + ResourceUsageBatchData resource_usage_data = 2; +} + +// Service for node resource info access. +service NodeResourceInfoGcsService { + // Get node's resources from GCS Service. + rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); + // Update resources of a node in GCS Service. + rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); + // Delete resources of a node in GCS Service. + rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); + // Get available resources of all nodes. + rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) + returns (GetAllAvailableResourcesReply); + // Report resource usage of a node to GCS Service. + rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); + // Get resource usage of all nodes from GCS Service. + rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); +} + // Service for heartbeat info access. service HeartbeatInfoGcsService { // Report heartbeat of a node to GCS Service. @@ -260,11 +272,6 @@ message AddObjectLocationRequest { // The spilled URL that will be added to GCS Service. Either this or the node // ID should be set. string spilled_url = 3; - // The node id that spills the object to the disk. - // It will be Nil if it uses a distributed external storage. 
- bytes spilled_node_id = 4; - // The size of the object in bytes. - uint64 size = 5; } message AddObjectLocationReply { @@ -492,17 +499,6 @@ message WaitPlacementGroupUntilReadyReply { GcsStatus status = 1; } -message GetNamedPlacementGroupRequest { - // Name of the placement group. - string name = 1; -} - -message GetNamedPlacementGroupReply { - GcsStatus status = 1; - // Data of placement group. - PlacementGroupTableData placement_group_table_data = 2; -} - // Service for placement group info access. service PlacementGroupInfoGcsService { // Create placement group via gcs service. @@ -513,9 +509,6 @@ service PlacementGroupInfoGcsService { returns (RemovePlacementGroupReply); // Get placement group information via gcs service. rpc GetPlacementGroup(GetPlacementGroupRequest) returns (GetPlacementGroupReply); - // Get named placement group information via gcs service. - rpc GetNamedPlacementGroup(GetNamedPlacementGroupRequest) - returns (GetNamedPlacementGroupReply); // Get information of all placement group from GCS Service. rpc GetAllPlacementGroup(GetAllPlacementGroupRequest) returns (GetAllPlacementGroupReply); @@ -523,41 +516,3 @@ service PlacementGroupInfoGcsService { rpc WaitPlacementGroupUntilReady(WaitPlacementGroupUntilReadyRequest) returns (WaitPlacementGroupUntilReadyReply); } -/////////////////////////////////////////////////////////////////////////////// -/* Please do not modify/remove/change the following messages to maintain -backwards compatibility in autoscaler. This is necessary to make sure we can -run autoscaler with any version of ray. For example, the K8s operator runs -autoscaler in a separate pod, if the user upgrades the ray version on the head -pod autoscaler can crash (if the newer version of ray modified the messages -below). */ - -message GetAllResourceUsageRequest { -} - -message GetAllResourceUsageReply { - GcsStatus status = 1; - ResourceUsageBatchData resource_usage_data = 2; -} - -// Service for node resource info access. 
-service NodeResourceInfoGcsService { - // Get node's resources from GCS Service. - rpc GetResources(GetResourcesRequest) returns (GetResourcesReply); - // Update resources of a node in GCS Service. - rpc UpdateResources(UpdateResourcesRequest) returns (UpdateResourcesReply); - // Delete resources of a node in GCS Service. - rpc DeleteResources(DeleteResourcesRequest) returns (DeleteResourcesReply); - // Get available resources of all nodes. - rpc GetAllAvailableResources(GetAllAvailableResourcesRequest) - returns (GetAllAvailableResourcesReply); - // Report resource usage of a node to GCS Service. - rpc ReportResourceUsage(ReportResourceUsageRequest) returns (ReportResourceUsageReply); - // Get resource usage of all nodes from GCS Service. - rpc GetAllResourceUsage(GetAllResourceUsageRequest) returns (GetAllResourceUsageReply); -} - -message GcsStatus { - int32 code = 1; - string message = 2; -} -/////////////////////////////////////////////////////////////////////////////// diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index 9273665f3ed2..bae2a9715100 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -138,8 +138,6 @@ message ObjectStoreStats { int64 object_store_bytes_avail = 8; // The number of local objects total. int64 num_local_objects = 9; - // The number of plasma object bytes that are consumed by core workers. - int64 consumed_bytes = 10; } message GetNodeStatsReply { @@ -179,22 +177,6 @@ message RequestObjectSpillageRequest { message RequestObjectSpillageReply { // Whether the object spilling was successful or not. bool success = 1; - // Object URL where the object is spilled. - string object_url = 2; - // The node id of a node where the object is spilled. - bytes spilled_node_id = 3; -} - -message RestoreSpilledObjectRequest { - // ObjectID to restore. - bytes object_id = 1; - // Object URL where the object is spilled. 
- string object_url = 2; - // The node id of a node where the object is spilled. - bytes spilled_node_id = 3; -} - -message RestoreSpilledObjectReply { } message ReleaseUnusedBundlesRequest { @@ -242,9 +224,6 @@ service NodeManagerService { // Ask the raylet to spill an object to external storage. rpc RequestObjectSpillage(RequestObjectSpillageRequest) returns (RequestObjectSpillageReply); - // Ask the raylet to restore the object from the external storage. - rpc RestoreSpilledObject(RestoreSpilledObjectRequest) - returns (RestoreSpilledObjectReply); // This method is only used by GCS, and the purpose is to release bundles // that may be leaked. When GCS restarts, it doesn't know which bundles it has leased // in the previous lifecycle. In this case, GCS will send a list of bundles that diff --git a/src/ray/protobuf/ray_client.proto b/src/ray/protobuf/ray_client.proto index 6781f1935246..1ba8675017d8 100644 --- a/src/ray/protobuf/ray_client.proto +++ b/src/ray/protobuf/ray_client.proto @@ -266,8 +266,6 @@ message ConnectionInfoResponse { string ray_commit = 3; // The Python version (e.g., "3.7.2"). string python_version = 4; - // The protocol version of the server (e.g., "2020-02-01"). 
- string protocol_version = 5; } message DataRequest { diff --git a/src/ray/raylet/dependency_manager.cc b/src/ray/raylet/dependency_manager.cc index 7c9faf642d3c..988893beaa47 100644 --- a/src/ray/raylet/dependency_manager.cc +++ b/src/ray/raylet/dependency_manager.cc @@ -185,6 +185,12 @@ bool DependencyManager::RequestTaskDependencies( return task_entry.num_missing_dependencies == 0; } +bool DependencyManager::IsTaskReady(const TaskID &task_id) const { + auto task_entry = queued_task_requests_.find(task_id); + RAY_CHECK(task_entry != queued_task_requests_.end()); + return task_entry->second.num_missing_dependencies == 0; +} + void DependencyManager::RemoveTaskDependencies(const TaskID &task_id) { RAY_LOG(DEBUG) << "Removing dependencies for task " << task_id; auto task_entry = queued_task_requests_.find(task_id); diff --git a/src/ray/raylet/dependency_manager.h b/src/ray/raylet/dependency_manager.h index 903a9893a579..1e7ddfcb17c1 100644 --- a/src/ray/raylet/dependency_manager.h +++ b/src/ray/raylet/dependency_manager.h @@ -37,6 +37,7 @@ class TaskDependencyManagerInterface { virtual bool RequestTaskDependencies( const TaskID &task_id, const std::vector &required_objects) = 0; + virtual bool IsTaskReady(const TaskID &task_id) const = 0; virtual void RemoveTaskDependencies(const TaskID &task_id) = 0; virtual ~TaskDependencyManagerInterface(){}; }; @@ -130,6 +131,14 @@ class DependencyManager : public TaskDependencyManagerInterface { bool RequestTaskDependencies(const TaskID &task_id, const std::vector &required_objects); + /// Check whether a task is ready to run. The task ID must have been + /// previously added by the caller. + /// + /// \param task_id The ID of the task to check. + /// \return Whether all of the dependencies for the task are + /// local. + bool IsTaskReady(const TaskID &task_id) const; + /// Cancel a task's dependencies. We will no longer attempt to fetch any /// remote dependencies, if no other task or worker requires them. 
/// diff --git a/src/ray/raylet/dependency_manager_test.cc b/src/ray/raylet/dependency_manager_test.cc index 6ea260bc3d97..c6d0ab2ee8c5 100644 --- a/src/ray/raylet/dependency_manager_test.cc +++ b/src/ray/raylet/dependency_manager_test.cc @@ -89,6 +89,7 @@ TEST_F(DependencyManagerTest, TestSimpleTask) { dependency_manager_.RequestTaskDependencies(task_id, ObjectIdsToRefs(arguments)); ASSERT_FALSE(ready); ASSERT_EQ(object_manager_mock_.active_requests.size(), 1); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // For each argument, tell the task dependency manager that the argument is // local. All arguments should be canceled as they become available locally. @@ -97,12 +98,15 @@ TEST_F(DependencyManagerTest, TestSimpleTask) { } auto ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[0]); ASSERT_TRUE(ready_task_ids.empty()); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[1]); ASSERT_TRUE(ready_task_ids.empty()); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // The task is ready to run. ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[2]); ASSERT_EQ(ready_task_ids.size(), 1); ASSERT_EQ(ready_task_ids.front(), task_id); + ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); // Remove the task. dependency_manager_.RemoveTaskDependencies(task_id); @@ -123,6 +127,7 @@ TEST_F(DependencyManagerTest, TestMultipleTasks) { bool ready = dependency_manager_.RequestTaskDependencies( task_id, ObjectIdsToRefs({argument_id})); ASSERT_FALSE(ready); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // The object should be requested from the object manager once for each task. 
ASSERT_EQ(object_manager_mock_.active_requests.size(), i + 1); } @@ -134,6 +139,7 @@ TEST_F(DependencyManagerTest, TestMultipleTasks) { std::unordered_set added_tasks(dependent_tasks.begin(), dependent_tasks.end()); for (auto &id : ready_task_ids) { ASSERT_TRUE(added_tasks.erase(id)); + ASSERT_TRUE(dependency_manager_.IsTaskReady(id)); } ASSERT_TRUE(added_tasks.empty()); @@ -160,6 +166,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { bool ready = dependency_manager_.RequestTaskDependencies(task_id, ObjectIdsToRefs(arguments)); ASSERT_FALSE(ready); + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // Tell the task dependency manager that each of the arguments is now // available. @@ -176,6 +183,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { ASSERT_TRUE(ready_tasks.empty()); } } + ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); // Simulate each of the arguments getting evicted. Each object should now be // considered remote. @@ -195,6 +203,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { // the waiting state. 
ASSERT_TRUE(waiting_tasks.empty()); } + ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); } // Tell the task dependency manager that each of the arguments is available @@ -212,6 +221,7 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { ASSERT_TRUE(ready_tasks.empty()); } } + ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); dependency_manager_.RemoveTaskDependencies(task_id); AssertNoLeaks(); diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index d37576a48ede..721adb6bd3eb 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -21,8 +21,8 @@ namespace ray { namespace raylet { void LocalObjectManager::PinObjects(const std::vector &object_ids, - std::vector> &&objects, - const rpc::Address &owner_address) { + std::vector> &&objects) { + RAY_CHECK(object_pinning_enabled_); for (size_t i = 0; i < object_ids.size(); i++) { const auto &object_id = object_ids[i]; auto &object = objects[i]; @@ -32,8 +32,7 @@ void LocalObjectManager::PinObjects(const std::vector &object_ids, continue; } RAY_LOG(DEBUG) << "Pinning object " << object_id; - pinned_objects_size_ += object->GetSize(); - pinned_objects_.emplace(object_id, std::make_pair(std::move(object), owner_address)); + pinned_objects_.emplace(object_id, std::move(object)); } } @@ -60,16 +59,16 @@ void LocalObjectManager::WaitForObjectFree(const rpc::Address &owner_address, } void LocalObjectManager::ReleaseFreedObject(const ObjectID &object_id) { - RAY_LOG(DEBUG) << "Unpinning object " << object_id; - // The object should be in one of these stats. pinned, spilling, or spilled. 
- RAY_CHECK((pinned_objects_.count(object_id) > 0) || - (spilled_objects_url_.count(object_id) > 0) || - (objects_pending_spill_.count(object_id) > 0)); - if (automatic_object_deletion_enabled_) { - spilled_object_pending_delete_.push(object_id); - } - if (pinned_objects_.count(object_id)) { - pinned_objects_size_ -= pinned_objects_[object_id].first->GetSize(); + // object_pinning_enabled_ flag is off when the --lru-evict flag is on. + if (object_pinning_enabled_) { + RAY_LOG(DEBUG) << "Unpinning object " << object_id; + // The object should be in one of these stats. pinned, spilling, or spilled. + RAY_CHECK((pinned_objects_.count(object_id) > 0) || + (spilled_objects_url_.count(object_id) > 0) || + (objects_pending_spill_.count(object_id) > 0)); + if (automatic_object_deletion_enabled_) { + spilled_object_pending_delete_.push(object_id); + } pinned_objects_.erase(object_id); } @@ -89,7 +88,7 @@ void LocalObjectManager::FlushFreeObjects() { on_objects_freed_(objects_to_free_); objects_to_free_.clear(); } - if (automatic_object_deletion_enabled_) { + if (object_pinning_enabled_ && automatic_object_deletion_enabled_) { // Deletion wouldn't work when the object pinning is not enabled. 
ProcessSpilledObjectsDeleteQueue(free_objects_batch_size_); } @@ -140,7 +139,7 @@ bool LocalObjectManager::SpillObjectsOfSize(int64_t num_bytes_to_spill) { std::vector objects_to_spill; while (bytes_to_spill <= num_bytes_to_spill && it != pinned_objects_.end()) { if (is_plasma_object_spillable_(it->first)) { - bytes_to_spill += it->second.first->GetSize(); + bytes_to_spill += it->second->GetSize(); objects_to_spill.push_back(it->first); } it++; @@ -152,7 +151,7 @@ bool LocalObjectManager::SpillObjectsOfSize(int64_t num_bytes_to_spill) { SpillObjectsInternal(objects_to_spill, [this, bytes_to_spill, objects_to_spill, start_time](const Status &status) { if (!status.ok()) { - RAY_LOG(INFO) << "Failed to spill objects: " << status.ToString(); + RAY_LOG(ERROR) << "Error spilling objects " << status.ToString(); } else { auto now = absl::GetCurrentTimeNanos(); RAY_LOG(DEBUG) << "Spilled " << bytes_to_spill << " bytes in " @@ -207,7 +206,7 @@ void LocalObjectManager::SpillObjectsInternal( if (it != pinned_objects_.end()) { RAY_LOG(DEBUG) << "Spilling object " << id; objects_to_spill.push_back(id); - num_bytes_pending_spill_ += it->second.first->GetSize(); + num_bytes_pending_spill_ += it->second->GetSize(); objects_pending_spill_[id] = std::move(it->second); pinned_objects_.erase(it); } @@ -225,9 +224,6 @@ void LocalObjectManager::SpillObjectsInternal( for (const auto &object_id : objects_to_spill) { RAY_LOG(DEBUG) << "Sending spill request for object " << object_id; request.add_object_ids_to_spill(object_id.Binary()); - auto it = objects_pending_spill_.find(object_id); - RAY_CHECK(it != objects_pending_spill_.end()); - request.add_owner_addresses()->MergeFrom(it->second.second); } io_worker->rpc_client()->SpillObjects( request, [this, objects_to_spill, callback, io_worker]( @@ -241,7 +237,6 @@ void LocalObjectManager::SpillObjectsInternal( for (const auto &object_id : objects_to_spill) { auto it = objects_pending_spill_.find(object_id); RAY_CHECK(it != 
objects_pending_spill_.end()); - pinned_objects_size_ += it->second.first->GetSize(); pinned_objects_.emplace(object_id, std::move(it->second)); objects_pending_spill_.erase(it); } @@ -258,46 +253,6 @@ void LocalObjectManager::SpillObjectsInternal( }); } -void LocalObjectManager::UnpinSpilledObjectCallback( - const ObjectID &object_id, const std::string &object_url, - std::shared_ptr num_remaining, - std::function callback, ray::Status status) { - if (!status.ok()) { - RAY_LOG(INFO) << "Failed to send spilled url for object " << object_id - << " to object directory, considering the object to have been freed: " - << status.ToString(); - } else { - RAY_LOG(DEBUG) << "Object " << object_id << " spilled to " << object_url - << " and object directory has been informed"; - } - RAY_LOG(DEBUG) << "Unpinning pending spill object " << object_id; - // Unpin the object. - auto it = objects_pending_spill_.find(object_id); - RAY_CHECK(it != objects_pending_spill_.end()); - num_bytes_pending_spill_ -= it->second.first->GetSize(); - objects_pending_spill_.erase(it); - - // Update the object_id -> url_ref_count to use it for deletion later. - // We need to track the references here because a single file can contain - // multiple objects, and we shouldn't delete the file until - // all the objects are gone out of scope. - // object_url is equivalent to url_with_offset. 
- auto parsed_url = ParseURL(object_url); - const auto base_url_it = parsed_url->find("url"); - RAY_CHECK(base_url_it != parsed_url->end()); - if (!url_ref_count_.contains(base_url_it->second)) { - url_ref_count_[base_url_it->second] = 1; - } else { - url_ref_count_[base_url_it->second] += 1; - } - spilled_objects_url_.emplace(object_id, object_url); - - (*num_remaining)--; - if (*num_remaining == 0 && callback) { - callback(status); - } -} - void LocalObjectManager::AddSpilledUrls( const std::vector &object_ids, const rpc::SpillObjectsReply &worker_reply, std::function callback) { @@ -306,77 +261,51 @@ void LocalObjectManager::AddSpilledUrls( const ObjectID &object_id = object_ids[i]; const std::string &object_url = worker_reply.spilled_objects_url(i); RAY_LOG(DEBUG) << "Object " << object_id << " spilled at " << object_url; - // Choose a node id to report. If an external storage type is not a filesystem, we - // don't need to report where this object is spilled. - const auto node_id_object_spilled = - is_external_storage_type_fs_ ? self_node_id_ : NodeID::Nil(); - - auto it = objects_pending_spill_.find(object_id); - RAY_CHECK(it != objects_pending_spill_.end()); - - auto unpin_callback = - std::bind(&LocalObjectManager::UnpinSpilledObjectCallback, this, object_id, - object_url, num_remaining, callback, std::placeholders::_1); - - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - // TODO(Clark): Don't send RPC to owner if we're fulfilling an owner-initiated - // spill RPC. - rpc::AddSpilledUrlRequest request; - request.set_object_id(object_id.Binary()); - request.set_spilled_url(object_url); - request.set_spilled_node_id(node_id_object_spilled.Binary()); - request.set_size(it->second.first->GetSize()); + // Write to object directory. Wait for the write to finish before + // releasing the object to make sure that the spilled object can + // be retrieved by other raylets. 
+ RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl( + object_id, object_url, + [this, object_id, object_url, callback, num_remaining](Status status) { + RAY_CHECK_OK(status); + // Unpin the object. + auto it = objects_pending_spill_.find(object_id); + RAY_CHECK(it != objects_pending_spill_.end()); + num_bytes_pending_spill_ -= it->second->GetSize(); + objects_pending_spill_.erase(it); + + // Update the object_id -> url_ref_count to use it for deletion later. + // We need to track the references here because a single file can contain + // multiple objects, and we shouldn't delete the file until + // all the objects are gone out of scope. + // object_url is equivalent to url_with_offset. + auto parsed_url = ParseURL(object_url); + const auto base_url_it = parsed_url->find("url"); + RAY_CHECK(base_url_it != parsed_url->end()); + if (!url_ref_count_.contains(base_url_it->second)) { + url_ref_count_[base_url_it->second] = 1; + } else { + url_ref_count_[base_url_it->second] += 1; + } + spilled_objects_url_.emplace(object_id, object_url); - auto owner_client = owner_client_pool_.GetOrConnect(it->second.second); - RAY_LOG(DEBUG) << "Sending spilled URL " << object_url << " for object " - << object_id << " to owner " - << WorkerID::FromBinary(it->second.second.worker_id()); - // Send spilled URL, spilled node ID, and object size to owner. - owner_client->AddSpilledUrl( - request, [unpin_callback](Status status, const rpc::AddSpilledUrlReply &reply) { - unpin_callback(status); - }); - } else { - // Write to object directory. Wait for the write to finish before - // releasing the object to make sure that the spilled object can - // be retrieved by other raylets. 
- RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl( - object_id, object_url, node_id_object_spilled, it->second.first->GetSize(), - unpin_callback)); - } + (*num_remaining)--; + if (*num_remaining == 0 && callback) { + callback(status); + } + })); } } void LocalObjectManager::AsyncRestoreSpilledObject( - const ObjectID &object_id, const std::string &object_url, const NodeID &node_id, + const ObjectID &object_id, const std::string &object_url, std::function callback) { + RAY_LOG(DEBUG) << "Restoring spilled object " << object_id << " from URL " + << object_url; if (objects_pending_restore_.count(object_id) > 0) { // If the same object is restoring, we dedup here. return; } - - if (!node_id.IsNil() && node_id != self_node_id_) { - // If we know where this object was spilled, and the current node is not that one, - // send a RPC to a remote node that spilled the object to restore it. - RAY_LOG(DEBUG) << "Send an object restoration request of id: " << object_id - << " to a remote node: " << node_id; - // TODO(sang): We need to deduplicate this remote RPC. Since restore request - // is retried every 10ms without exponential backoff, this can add huge overhead to - // a remote node that spilled the object. - restore_object_from_remote_node_(object_id, object_url, node_id); - if (callback) { - callback(Status::OK()); - } - return; - } - - // Restore the object. - RAY_LOG(DEBUG) << "Restoring spilled object " << object_id << " from URL " - << object_url; - if (!node_id.IsNil()) { - RAY_CHECK(spilled_objects_url_.count(object_id) > 0); - } - RAY_CHECK(objects_pending_restore_.emplace(object_id).second) << "Object dedupe wasn't done properly. 
Please report if you see this issue."; io_worker_pool_.PopRestoreWorker([this, object_id, object_url, callback]( @@ -432,9 +361,9 @@ void LocalObjectManager::ProcessSpilledObjectsDeleteQueue(uint32_t max_batch_siz object_urls_to_delete.size() < max_batch_size) { auto &object_id = spilled_object_pending_delete_.front(); // If the object is still spilling, do nothing. This will block other entries to be - // processed, but it should be fine because the spilling will be eventually done, - // and deleting objects is the low priority tasks. This will instead enable simpler - // logic after this block. + // processed, but it should be fine because the spilling will be eventually done, and + // deleting objects is the low priority tasks. + // This will instead enable simpler logic after this block. if (objects_pending_spill_.contains(object_id)) { break; } @@ -442,8 +371,8 @@ void LocalObjectManager::ProcessSpilledObjectsDeleteQueue(uint32_t max_batch_siz // Object id is either spilled or not spilled at this point. const auto spilled_objects_url_it = spilled_objects_url_.find(object_id); if (spilled_objects_url_it != spilled_objects_url_.end()) { - // If the object was spilled, see if we can delete it. We should first check the - // ref count. + // If the object was spilled, see if we can delete it. We should first check the ref + // count. std::string &object_url = spilled_objects_url_it->second; // Note that here, we need to parse the object url to obtain the base_url. 
auto parsed_url = ParseURL(object_url); @@ -500,16 +429,6 @@ void LocalObjectManager::FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) stats->set_restored_objects_total(restored_objects_total_); } -std::string LocalObjectManager::DebugString() const { - std::stringstream result; - result << "LocalObjectManager:\n"; - result << "- num pinned objects: " << pinned_objects_.size() << "\n"; - result << "- pinned objects size: " << pinned_objects_size_ << "\n"; - result << "- num objects pending restore: " << objects_pending_restore_.size() << "\n"; - result << "- num objects pending spill: " << objects_pending_spill_.size() << "\n"; - result << "- num bytes pending spill: " << num_bytes_pending_spill_ << "\n"; - return result.str(); -} - }; // namespace raylet + }; // namespace ray diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index 285060ab5cd3..14142f5f913d 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -16,8 +16,6 @@ #include -#include -#include #include #include "ray/common/id.h" @@ -26,7 +24,6 @@ #include "ray/object_manager/common.h" #include "ray/raylet/worker_pool.h" #include "ray/rpc/worker/core_worker_client_pool.h" -#include "ray/util/util.h" #include "src/ray/protobuf/node_manager.pb.h" namespace ray { @@ -38,41 +35,35 @@ namespace raylet { class LocalObjectManager { public: LocalObjectManager( - const NodeID &node_id, size_t free_objects_batch_size, + boost::asio::io_service &io_context, size_t free_objects_batch_size, int64_t free_objects_period_ms, IOWorkerPoolInterface &io_worker_pool, gcs::ObjectInfoAccessor &object_info_accessor, - rpc::CoreWorkerClientPool &owner_client_pool, + rpc::CoreWorkerClientPool &owner_client_pool, bool object_pinning_enabled, bool automatic_object_deletion_enabled, int max_io_workers, - int64_t min_spilling_size, bool is_external_storage_type_fs, + int64_t min_spilling_size, std::function &)> on_objects_freed, - std::function 
is_plasma_object_spillable, - std::function - restore_object_from_remote_node) - : self_node_id_(node_id), - free_objects_period_ms_(free_objects_period_ms), + std::function is_plasma_object_spillable) + : free_objects_period_ms_(free_objects_period_ms), free_objects_batch_size_(free_objects_batch_size), io_worker_pool_(io_worker_pool), object_info_accessor_(object_info_accessor), owner_client_pool_(owner_client_pool), + object_pinning_enabled_(object_pinning_enabled), automatic_object_deletion_enabled_(automatic_object_deletion_enabled), on_objects_freed_(on_objects_freed), last_free_objects_at_ms_(current_time_ms()), min_spilling_size_(min_spilling_size), num_active_workers_(0), max_active_workers_(max_io_workers), - is_plasma_object_spillable_(is_plasma_object_spillable), - restore_object_from_remote_node_(restore_object_from_remote_node), - is_external_storage_type_fs_(is_external_storage_type_fs) {} + is_plasma_object_spillable_(is_plasma_object_spillable) {} /// Pin objects. /// /// \param object_ids The objects to be pinned. /// \param objects Pointers to the objects to be pinned. The pointer should /// be kept in scope until the object can be released. - /// \param owner_address The owner of the objects to be pinned. void PinObjects(const std::vector &object_ids, - std::vector> &&objects, - const rpc::Address &owner_address); + std::vector> &&objects); /// Wait for the objects' owner to free the object. The objects will be /// released when the owner at the given address fails or replies that the @@ -99,15 +90,10 @@ class LocalObjectManager { /// Restore a spilled object from external storage back into local memory. /// /// \param object_id The ID of the object to restore. - /// \param object_url The URL where the object is spilled. - /// \param node_id Node id that we try restoring the object. If Nil is provided, the - /// object is restored directly from the external storage. 
If a node id is provided, it - /// sends a RPC request to a corresponding node if the given node_id is not equivalent - /// to a self node id. - /// \param callback A callback to call when the restoration is done. - /// Status will contain the error during restoration, if any. + /// \param object_url The URL in external storage from which the object can be restored. + /// \param callback A callback to call when the restoration is done. Status + /// will contain the error during restoration, if any. void AsyncRestoreSpilledObject(const ObjectID &object_id, const std::string &object_url, - const NodeID &node_id, std::function callback); /// Try to clear any objects that have been freed. @@ -137,8 +123,6 @@ class LocalObjectManager { /// \param Output parameter. void FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) const; - std::string DebugString() const; - private: FRIEND_TEST(LocalObjectManagerTest, TestSpillObjectsOfSize); FRIEND_TEST(LocalObjectManagerTest, @@ -165,14 +149,6 @@ class LocalObjectManager { /// objects. void FlushFreeObjects(); - // A callback for unpinning spilled objects. This should be invoked after the object - // has been spilled and after the object directory has been sent the spilled URL. - void UnpinSpilledObjectCallback(const ObjectID &object_id, - const std::string &object_url, - std::shared_ptr num_remaining, - std::function callback, - ray::Status status); - /// Add objects' spilled URLs to the global object directory. Call the /// callback once all URLs have been added. void AddSpilledUrls(const std::vector &object_ids, @@ -184,8 +160,6 @@ class LocalObjectManager { /// \param urls_to_delete List of urls to delete from external storages. void DeleteSpilledObjects(std::vector &urls_to_delete); - const NodeID self_node_id_; - /// The period between attempts to eagerly evict objects from plasma. const int64_t free_objects_period_ms_; @@ -202,6 +176,9 @@ class LocalObjectManager { /// this node. 
rpc::CoreWorkerClientPool &owner_client_pool_; + /// Whether to enable pinning for plasma objects. + bool object_pinning_enabled_; + /// Whether to enable automatic deletion when refs are gone out of scope. bool automatic_object_deletion_enabled_; @@ -209,17 +186,12 @@ class LocalObjectManager { std::function &)> on_objects_freed_; // Objects that are pinned on this node. - absl::flat_hash_map, rpc::Address>> - pinned_objects_; - - // Total size of objects pinned on this node. - size_t pinned_objects_size_ = 0; + absl::flat_hash_map> pinned_objects_; // Objects that were pinned on this node but that are being spilled. // These objects will be released once spilling is complete and the URL is // written to the object directory. - absl::flat_hash_map, rpc::Address>> - objects_pending_spill_; + absl::flat_hash_map> objects_pending_spill_; /// Objects that were spilled on this node but that are being restored. /// The field is used to dedup the same restore request while restoration is in @@ -275,16 +247,6 @@ class LocalObjectManager { /// Return true if unpinned, meaning we can safely spill the object. False otherwise. std::function is_plasma_object_spillable_; - /// Callback to restore object of object id from a remote node of node id. - std::function - restore_object_from_remote_node_; - - /// Used to decide spilling protocol. - /// If it is "filesystem", it restores spilled objects only from an owner node. - /// If it is not (meaning it is distributed backend), it always restores objects - /// directly from the external storage. 
- bool is_external_storage_type_fs_; - /// /// Stats /// diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index 729c400fe31a..ba6a53ee473f 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -196,7 +196,7 @@ int main(int argc, char *argv[]) { } node_manager_config.heartbeat_period_ms = - RayConfig::instance().raylet_heartbeat_period_milliseconds(); + RayConfig::instance().raylet_heartbeat_timeout_milliseconds(); node_manager_config.report_resources_period_ms = RayConfig::instance().raylet_report_resources_period_milliseconds(); node_manager_config.debug_dump_period_ms = @@ -205,6 +205,8 @@ int main(int argc, char *argv[]) { RayConfig::instance().metrics_report_interval_ms() / 2; node_manager_config.fair_queueing_enabled = RayConfig::instance().fair_queueing_enabled(); + node_manager_config.object_pinning_enabled = + RayConfig::instance().object_pinning_enabled(); node_manager_config.automatic_object_deletion_enabled = RayConfig::instance().automatic_object_deletion_enabled(); node_manager_config.store_socket_name = store_socket_name; diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 4eb3941dd260..1b8c50c5870e 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -130,6 +130,7 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self std::chrono::milliseconds(config.report_resources_period_ms)), debug_dump_period_(config.debug_dump_period_ms), fair_queueing_enabled_(config.fair_queueing_enabled), + object_pinning_enabled_(config.object_pinning_enabled), temp_dir_(config.temp_dir), object_manager_profile_timer_(io_service), initial_config_(config), @@ -157,28 +158,19 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self agent_manager_service_(io_service, *agent_manager_service_handler_), client_call_manager_(io_service), worker_rpc_pool_(client_call_manager_), - local_object_manager_( - self_node_id_, 
RayConfig::instance().free_objects_batch_size(), - RayConfig::instance().free_objects_period_milliseconds(), worker_pool_, - gcs_client_->Objects(), worker_rpc_pool_, - /* automatic_object_deletion_enabled */ - config.automatic_object_deletion_enabled, - /*max_io_workers*/ config.max_io_workers, - /*min_spilling_size*/ config.min_spilling_size, - /*is_external_storage_type_fs*/ - RayConfig::instance().is_external_storage_type_fs(), - /*on_objects_freed*/ - [this](const std::vector &object_ids) { - object_manager_.FreeObjects(object_ids, - /*local_only=*/false); - }, - is_plasma_object_spillable, - /*restore_object_from_remote_node*/ - [this](const ObjectID &object_id, const std::string &spilled_url, - const NodeID &node_id) { - SendSpilledObjectRestorationRequestToRemoteNode(object_id, spilled_url, - node_id); - }), + local_object_manager_(io_service_, RayConfig::instance().free_objects_batch_size(), + RayConfig::instance().free_objects_period_milliseconds(), + worker_pool_, gcs_client_->Objects(), worker_rpc_pool_, + /* object_pinning_enabled */ config.object_pinning_enabled, + /* automatic_object_deletion_enabled */ + config.automatic_object_deletion_enabled, + /*max_io_workers*/ config.max_io_workers, + /*min_spilling_size*/ config.min_spilling_size, + [this](const std::vector &object_ids) { + object_manager_.FreeObjects(object_ids, + /*local_only=*/false); + }, + is_plasma_object_spillable), report_worker_backlog_(RayConfig::instance().report_worker_backlog()), last_local_gc_ns_(absl::GetCurrentTimeNanos()), local_gc_interval_ns_(RayConfig::instance().local_gc_interval_s() * 1e9), @@ -220,11 +212,7 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self self_node_id_, std::dynamic_pointer_cast(cluster_resource_scheduler_), dependency_manager_, is_owner_alive, get_node_info_func, announce_infeasible_task, - worker_pool_, leased_workers_, - [this](const std::vector &object_ids, - std::vector> *results) { - return 
GetObjectsFromPlasma(object_ids, results); - })); + worker_pool_, leased_workers_)); placement_group_resource_manager_ = std::make_shared( std::dynamic_pointer_cast( @@ -408,7 +396,7 @@ void NodeManager::Heartbeat() { uint64_t now_ms = current_time_ms(); uint64_t interval = now_ms - last_heartbeat_at_ms_; if (interval > RayConfig::instance().num_heartbeats_warning() * - RayConfig::instance().raylet_heartbeat_period_milliseconds()) { + RayConfig::instance().raylet_heartbeat_timeout_milliseconds()) { RAY_LOG(WARNING) << "Last heartbeat was sent " << interval << " ms ago. There might be resource pressure on this node. If heartbeat keeps " @@ -454,7 +442,6 @@ void NodeManager::Heartbeat() { void NodeManager::ReportResourceUsage() { auto resources_data = std::make_shared(); resources_data->set_node_id(self_node_id_.Binary()); - resources_data->set_node_manager_address(initial_config_.node_manager_address); // Update local chche from gcs remote cache, this is needed when gcs restart. // We should always keep the cache view consistent. 
cluster_resource_scheduler_->UpdateLastResourceUsage( @@ -514,40 +501,16 @@ void NodeManager::DoLocalGC() { void NodeManager::HandleRequestObjectSpillage( const rpc::RequestObjectSpillageRequest &request, rpc::RequestObjectSpillageReply *reply, rpc::SendReplyCallback send_reply_callback) { - const auto &object_id = ObjectID::FromBinary(request.object_id()); - RAY_LOG(DEBUG) << "Received RequestObjectSpillage for object " << object_id; local_object_manager_.SpillObjects( - {object_id}, [object_id, reply, send_reply_callback](const ray::Status &status) { + {ObjectID::FromBinary(request.object_id())}, + [reply, send_reply_callback](const ray::Status &status) { if (status.ok()) { - RAY_LOG(DEBUG) << "Object " << object_id - << " has been spilled, replying to owner"; reply->set_success(true); - // TODO(Clark): Add spilled URLs and spilled node ID to owner RPC reply here - // if OBOD is enabled, instead of relying on automatic raylet spilling path to - // send an extra RPC to the owner. } send_reply_callback(Status::OK(), nullptr, nullptr); }); } -void NodeManager::HandleRestoreSpilledObject( - const rpc::RestoreSpilledObjectRequest &request, - rpc::RestoreSpilledObjectReply *reply, rpc::SendReplyCallback send_reply_callback) { - const auto object_id = ObjectID::FromBinary(request.object_id()); - const auto spilled_node_id = NodeID::FromBinary(request.spilled_node_id()); - const auto object_url = request.object_url(); - RAY_CHECK(spilled_node_id == self_node_id_); - RAY_LOG(DEBUG) << "Restore spilled object request received. Object id: " << object_id - << " spilled_node_id: " << self_node_id_ - << " object url: " << object_url; - local_object_manager_.AsyncRestoreSpilledObject(object_id, object_url, spilled_node_id, - nullptr); - // Just reply right away. The caller will keep hitting this RPC endpoint until - // restoration succeeds, so we can safely reply here without waiting for the - // restoreSpilledObject to be done. 
- send_reply_callback(Status::OK(), nullptr, nullptr); -} - void NodeManager::HandleReleaseUnusedBundles( const rpc::ReleaseUnusedBundlesRequest &request, rpc::ReleaseUnusedBundlesReply *reply, rpc::SendReplyCallback send_reply_callback) { @@ -727,7 +690,7 @@ void NodeManager::NodeRemoved(const NodeID &node_id) { << "Exiting because this node manager has mistakenly been marked dead by the " << "monitor: GCS didn't receive heartbeats within timeout " << RayConfig::instance().num_heartbeats_timeout() * - RayConfig::instance().raylet_heartbeat_period_milliseconds() + RayConfig::instance().raylet_heartbeat_timeout_milliseconds() << " ms. This is likely since the machine or raylet became overloaded."; // Below, when we remove node_id from all of these data structures, we could @@ -1251,9 +1214,8 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie if ((!task_id.IsNil() || !actor_id.IsNil()) && !worker->IsDead()) { // If the worker was an actor, it'll be cleaned by GCS. if (actor_id.IsNil()) { - // Return the resources that were being used by this worker. Task task; - cluster_task_manager_->TaskFinished(worker, &task); + static_cast(local_queues_.RemoveTask(task_id, &task)); } if (disconnect_type == rpc::WorkerExitType::SYSTEM_ERROR_EXIT) { @@ -1271,7 +1233,7 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie } // Remove the dead client from the pool and stop listening for messages. - worker_pool_.DisconnectWorker(worker, disconnect_type); + worker_pool_.DisconnectWorker(worker); // Return the resources that were being used by this worker. 
cluster_task_manager_->ReleaseWorkerResources(worker); @@ -2067,42 +2029,52 @@ void NodeManager::HandleTaskReconstruction(const TaskID &task_id, rpc::Address owner_addr; bool has_owner = dependency_manager_.GetOwnerAddress(required_object_id, &owner_addr); if (has_owner) { - RAY_LOG(DEBUG) << "Required object " << required_object_id - << " fetch timed out, asking owner " - << WorkerID::FromBinary(owner_addr.worker_id()); - // The owner's address exists. Poll the owner to check if the object is - // still in scope. If not, mark the object as failed. - // TODO(swang): If the owner has died, we could also mark the object as - // failed as soon as we hear about the owner's failure from the GCS, - // avoiding the raylet's reconstruction timeout. - auto client = std::unique_ptr( - new rpc::CoreWorkerClient(owner_addr, client_call_manager_)); - - rpc::GetObjectStatusRequest request; - request.set_object_id(required_object_id.Binary()); - request.set_owner_worker_id(owner_addr.worker_id()); - client->GetObjectStatus( - request, [this, required_object_id, owner_addr]( - Status status, const rpc::GetObjectStatusReply &reply) { - if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE || - reply.status() == rpc::GetObjectStatusReply::FREED) { - // The owner is gone, or the owner replied that the object has - // gone out of scope (this is an edge case in the distributed ref - // counting protocol where a borrower dies before it can notify - // the owner of another borrower), or the object value has been - // freed. Store an error in the local plasma store so that an - // exception will be thrown when the worker tries to get the - // value. - rpc::ObjectReference ref; - ref.set_object_id(required_object_id.Binary()); - ref.mutable_owner_address()->CopyFrom(owner_addr); - MarkObjectsAsFailed(ErrorType::OBJECT_UNRECONSTRUCTABLE, {ref}, JobID::Nil()); - } - // Do nothing if the owner replied that the object is available. 
The - // object manager will continue trying to fetch the object, and this - // handler will get triggered again if the object is still - // unavailable after another timeout. - }); + if (!RayConfig::instance().object_pinning_enabled()) { + // LRU eviction is enabled. The object may still be in scope, but we + // weren't able to fetch the value within the timeout, so the value has + // most likely been evicted. Mark the object as unreachable. + rpc::ObjectReference ref; + ref.set_object_id(required_object_id.Binary()); + ref.mutable_owner_address()->CopyFrom(owner_addr); + MarkObjectsAsFailed(ErrorType::OBJECT_UNRECONSTRUCTABLE, {ref}, JobID::Nil()); + } else { + RAY_LOG(DEBUG) << "Required object " << required_object_id + << " fetch timed out, asking owner " + << WorkerID::FromBinary(owner_addr.worker_id()); + // The owner's address exists. Poll the owner to check if the object is + // still in scope. If not, mark the object as failed. + // TODO(swang): If the owner has died, we could also mark the object as + // failed as soon as we hear about the owner's failure from the GCS, + // avoiding the raylet's reconstruction timeout. + auto client = std::unique_ptr( + new rpc::CoreWorkerClient(owner_addr, client_call_manager_)); + + rpc::GetObjectStatusRequest request; + request.set_object_id(required_object_id.Binary()); + request.set_owner_worker_id(owner_addr.worker_id()); + client->GetObjectStatus(request, [this, required_object_id, owner_addr]( + Status status, + const rpc::GetObjectStatusReply &reply) { + if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE || + reply.status() == rpc::GetObjectStatusReply::FREED) { + // The owner is gone, or the owner replied that the object has + // gone out of scope (this is an edge case in the distributed ref + // counting protocol where a borrower dies before it can notify + // the owner of another borrower), or the object value has been + // freed. 
Store an error in the local plasma store so that an + // exception will be thrown when the worker tries to get the + // value. + rpc::ObjectReference ref; + ref.set_object_id(required_object_id.Binary()); + ref.mutable_owner_address()->CopyFrom(owner_addr); + MarkObjectsAsFailed(ErrorType::OBJECT_UNRECONSTRUCTABLE, {ref}, JobID::Nil()); + } + // Do nothing if the owner replied that the object is available. The + // object manager will continue trying to fetch the object, and this + // handler will get triggered again if the object is still + // unavailable after another timeout. + }); + } } else { RAY_LOG(WARNING) << "Ray cannot get the value of ObjectIDs that are generated " @@ -2329,7 +2301,6 @@ std::string NodeManager::DebugString() const { for (auto &pair : cluster_resource_map_) { result << "\n" << pair.first.Hex() << ": " << pair.second.DebugString(); } - result << "\n" << local_object_manager_.DebugString(); result << "\n" << object_manager_.DebugString(); result << "\n" << gcs_client_->DebugString(); result << "\n" << worker_pool_.DebugString(); @@ -2366,56 +2337,45 @@ std::string compact_tag_string(const opencensus::stats::ViewDescriptor &view, return result.str(); } -bool NodeManager::GetObjectsFromPlasma(const std::vector &object_ids, - std::vector> *results) { - // Pin the objects in plasma by getting them and holding a reference to - // the returned buffer. - // NOTE: the caller must ensure that the objects already exist in plasma before - // sending a PinObjectIDs request. - std::vector plasma_results; - // TODO(swang): This `Get` has a timeout of 0, so the plasma store will not - // block when serving the request. However, if the plasma store is under - // heavy load, then this request can still block the NodeManager event loop - // since we must wait for the plasma store's reply. We should consider using - // an `AsyncGet` instead. 
- if (!store_client_ - .Get(object_ids, /*timeout_ms=*/0, &plasma_results, /*is_from_worker=*/false) - .ok()) { - return false; - } - - for (const auto &plasma_result : plasma_results) { - if (plasma_result.data == nullptr) { - results->push_back(nullptr); - } else { - results->emplace_back(std::unique_ptr( - new RayObject(plasma_result.data, plasma_result.metadata, {}))); - } - } - return true; -} - void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, rpc::PinObjectIDsReply *reply, rpc::SendReplyCallback send_reply_callback) { std::vector object_ids; object_ids.reserve(request.object_ids_size()); - const auto &owner_address = request.owner_address(); for (const auto &object_id_binary : request.object_ids()) { object_ids.push_back(ObjectID::FromBinary(object_id_binary)); } - std::vector> results; - if (!GetObjectsFromPlasma(object_ids, &results)) { - RAY_LOG(WARNING) - << "Failed to get objects that should have been in the object store. These " - "objects may have been evicted while there are still references in scope."; - // TODO(suquark): Maybe "Status::ObjectNotFound" is more accurate here. - send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr); - return; + if (object_pinning_enabled_) { + // Pin the objects in plasma by getting them and holding a reference to + // the returned buffer. + // NOTE: the caller must ensure that the objects already exist in plasma before + // sending a PinObjectIDs request. + std::vector plasma_results; + // TODO(swang): This `Get` has a timeout of 0, so the plasma store will not + // block when serving the request. However, if the plasma store is under + // heavy load, then this request can still block the NodeManager event loop + // since we must wait for the plasma store's reply. We should consider using + // an `AsyncGet` instead. 
+ if (!store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results).ok()) { + RAY_LOG(WARNING) << "Failed to get objects to be pinned from object store."; + // TODO(suquark): Maybe "Status::ObjectNotFound" is more accurate here. + send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr); + return; + } + + std::vector> objects; + for (int64_t i = 0; i < request.object_ids().size(); i++) { + if (plasma_results[i].data == nullptr) { + objects.push_back(nullptr); + } else { + objects.emplace_back(std::unique_ptr( + new RayObject(plasma_results[i].data, plasma_results[i].metadata, {}))); + } + } + local_object_manager_.PinObjects(object_ids, std::move(objects)); } - local_object_manager_.PinObjects(object_ids, std::move(results), owner_address); // Wait for the object to be freed by the owner, which keeps the ref count. - local_object_manager_.WaitForObjectFree(owner_address, object_ids); + local_object_manager_.WaitForObjectFree(request.owner_address(), object_ids); send_reply_callback(Status::OK(), nullptr, nullptr); } @@ -2521,16 +2481,14 @@ rpc::ObjectStoreStats AccumulateStoreStats( rpc::ObjectStoreStats store_stats; for (const auto &reply : node_stats) { auto cur_store = reply.store_stats(); - // Use max aggregation for time, since the nodes are spilling concurrently. - store_stats.set_spill_time_total_s( - std::max(store_stats.spill_time_total_s(), cur_store.spill_time_total_s())); - store_stats.set_restore_time_total_s( - std::max(store_stats.restore_time_total_s(), cur_store.restore_time_total_s())); - // Use sum aggregation for the rest of the metrics. 
+ store_stats.set_spill_time_total_s(store_stats.spill_time_total_s() + + cur_store.spill_time_total_s()); store_stats.set_spilled_bytes_total(store_stats.spilled_bytes_total() + cur_store.spilled_bytes_total()); store_stats.set_spilled_objects_total(store_stats.spilled_objects_total() + cur_store.spilled_objects_total()); + store_stats.set_restore_time_total_s(store_stats.restore_time_total_s() + + cur_store.restore_time_total_s()); store_stats.set_restored_bytes_total(store_stats.restored_bytes_total() + cur_store.restored_bytes_total()); store_stats.set_restored_objects_total(store_stats.restored_objects_total() + @@ -2541,8 +2499,6 @@ rpc::ObjectStoreStats AccumulateStoreStats( cur_store.object_store_bytes_avail()); store_stats.set_num_local_objects(store_stats.num_local_objects() + cur_store.num_local_objects()); - store_stats.set_consumed_bytes(store_stats.consumed_bytes() + - cur_store.consumed_bytes()); } return store_stats; } @@ -2758,30 +2714,6 @@ void NodeManager::PublishInfeasibleTaskError(const Task &task) const { } } -void NodeManager::SendSpilledObjectRestorationRequestToRemoteNode( - const ObjectID &object_id, const std::string &spilled_url, const NodeID &node_id) { - // Fetch from a remote node. - if (!remote_node_manager_addresses_.contains(node_id)) { - // It is possible the new node information is not received at this point. - // In this case, the PullManager will handle retry, so we just return. - return; - } - const auto &entry = remote_node_manager_addresses_.find(node_id); - // TODO(sang): Use a node manager pool instead. - auto raylet_client = - std::make_shared(rpc::NodeManagerWorkerClient::make( - entry->second.first, entry->second.second, client_call_manager_)); - raylet_client->RestoreSpilledObject( - object_id, spilled_url, node_id, - [](const ray::Status &status, const rpc::RestoreSpilledObjectReply &r) { - if (!status.ok()) { - RAY_LOG(WARNING) << "Failed to send a spilled object restoration request to a " - "remote node. 
This request will be retried. Error message: " - << status.ToString(); - } - }); -} - } // namespace raylet } // namespace ray diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index d0819550958a..d626e5246297 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -28,7 +28,6 @@ #include "ray/common/task/scheduling_resources.h" #include "ray/object_manager/object_manager.h" #include "ray/raylet/agent_manager.h" -#include "ray/raylet_client/raylet_client.h" #include "ray/raylet/local_object_manager.h" #include "ray/raylet/scheduling/scheduling_ids.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" @@ -93,6 +92,8 @@ struct NodeManagerConfig { uint64_t debug_dump_period_ms; /// Whether to enable fair queueing between task classes in raylet. bool fair_queueing_enabled; + /// Whether to enable pinning for plasma objects. + bool object_pinning_enabled; /// Whether to enable automatic object deletion for object spilling. bool automatic_object_deletion_enabled; /// The store socket name. @@ -602,11 +603,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::RequestObjectSpillageReply *reply, rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `RestoreSpilledObject` request. - void HandleRestoreSpilledObject(const rpc::RestoreSpilledObjectRequest &request, - rpc::RestoreSpilledObjectReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `ReleaseUnusedBundles` request. void HandleReleaseUnusedBundles(const rpc::ReleaseUnusedBundlesRequest &request, rpc::ReleaseUnusedBundlesReply *reply, @@ -637,24 +633,9 @@ class NodeManager : public rpc::NodeManagerServiceHandler, /// \param task Task that is infeasible void PublishInfeasibleTaskError(const Task &task) const; - /// Send a object restoration request to a remote node of a given node id. 
- void SendSpilledObjectRestorationRequestToRemoteNode(const ObjectID &object_id, - const std::string &spilled_url, - const NodeID &node_id); - std::unordered_map> MakeTasksByClass( const std::vector &tasks) const; - /// Get pointers to objects stored in plasma. They will be - /// released once the returned references go out of scope. - /// - /// \param[in] object_ids The objects to get. - /// \param[out] results The pointers to objects stored in - /// plasma. - /// \return Whether the request was successful. - bool GetObjectsFromPlasma(const std::vector &object_ids, - std::vector> *results); - /////////////////////////////////////////////////////////////////////////////////////// //////////////////// Begin of the override methods of ClusterTaskManager ////////////// // The following methods are defined in node_manager.task.cc instead of node_manager.cc @@ -799,6 +780,8 @@ class NodeManager : public rpc::NodeManagerServiceHandler, int64_t debug_dump_period_; /// Whether to enable fair queueing between task classes in raylet. bool fair_queueing_enabled_; + /// Whether to enable pinning for plasma objects. + bool object_pinning_enabled_; /// Incremented each time we encounter a potential resource deadlock condition. /// This is reset to zero when the condition is cleared. int resource_deadlock_warned_ = 0; diff --git a/src/ray/raylet/node_manager.task.cc b/src/ray/raylet/node_manager.task.cc index 2fec7360b354..150ecb02d2ba 100644 --- a/src/ray/raylet/node_manager.task.cc +++ b/src/ray/raylet/node_manager.task.cc @@ -116,6 +116,8 @@ void NodeManager::FillResourceUsage(std::shared_ptr resource (*resources_data->mutable_resource_load())[resource_pair.first] = resource_pair.second; } + last_heartbeat_resources->SetLoadResources( + ResourceSet(local_resources.GetLoadResources())); } // Add resource load by shape. This will be used by the new autoscaler. 
diff --git a/src/ray/raylet/raylet.cc b/src/ray/raylet/raylet.cc index 4d9514e626da..6aeec576e1e4 100644 --- a/src/ray/raylet/raylet.cc +++ b/src/ray/raylet/raylet.cc @@ -72,11 +72,10 @@ Raylet::Raylet(boost::asio::io_service &main_service, const std::string &socket_ std::make_shared(main_service, gcs_client_))), object_manager_( main_service, self_node_id_, object_manager_config, object_directory_, - [this](const ObjectID &object_id, const std::string &object_url, - const NodeID &node_id, + [this](const ObjectID &object_id, const std::string &spilled_url, std::function callback) { node_manager_.GetLocalObjectManager().AsyncRestoreSpilledObject( - object_id, object_url, node_id, callback); + object_id, spilled_url, callback); }, [this]() { // This callback is called from the plasma store thread. diff --git a/src/ray/raylet/reconstruction_policy.cc b/src/ray/raylet/reconstruction_policy.cc index 1da422529cda..59d4789f08c5 100644 --- a/src/ray/raylet/reconstruction_policy.cc +++ b/src/ray/raylet/reconstruction_policy.cc @@ -179,8 +179,7 @@ void ReconstructionPolicy::HandleTaskLeaseExpired(const TaskID &task_id) { created_object_id, it->second.owner_addresses[created_object_id], [this, task_id, reconstruction_attempt]( const ray::ObjectID &object_id, const std::unordered_set &nodes, - const std::string &spilled_url, const ray::NodeID &spilled_node_id, - size_t object_size) { + const std::string &spilled_url) { if (nodes.empty() && spilled_url.empty()) { // The required object no longer exists on any live nodes. Attempt // reconstruction. 
diff --git a/src/ray/raylet/reconstruction_policy_test.cc b/src/ray/raylet/reconstruction_policy_test.cc index d4eb387a3ac0..199e4d51ee2d 100644 --- a/src/ray/raylet/reconstruction_policy_test.cc +++ b/src/ray/raylet/reconstruction_policy_test.cc @@ -58,10 +58,9 @@ class MockObjectDirectory : public ObjectDirectoryInterface { const ObjectID object_id = callback.first; auto it = locations_.find(object_id); if (it == locations_.end()) { - callback.second(object_id, std::unordered_set(), "", NodeID::Nil(), - 0); + callback.second(object_id, std::unordered_set(), ""); } else { - callback.second(object_id, it->second, "", NodeID::Nil(), 0); + callback.second(object_id, it->second, ""); } } callbacks_.clear(); diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.h b/src/ray/raylet/scheduling/cluster_resource_scheduler.h index 892db9e8b6a3..747fe6f6fba2 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.h +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.h @@ -387,7 +387,7 @@ class ClusterResourceScheduler : public ClusterResourceSchedulerInterface { /// /// \param gcs_resources: The remote cache from gcs. void UpdateLastResourceUsage( - const std::shared_ptr gcs_resources) override; + std::shared_ptr gcs_resources) override; /// Return human-readable string for this scheduler state. std::string DebugString() const; diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h b/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h index 21c6b6edccd3..ca2ba5237d71 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler_interface.h @@ -54,7 +54,7 @@ class ClusterResourceSchedulerInterface { /// /// \param gcs_resources: The remote cache from gcs. virtual void UpdateLastResourceUsage( - const std::shared_ptr gcs_resources) {} + std::shared_ptr gcs_resources) {} /// Populate the relevant parts of the heartbeat table. 
This is intended for /// sending raylet <-> gcs heartbeats. In particular, this should fill in diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index 109833eb59ab..a395e51b5077 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -20,10 +20,7 @@ ClusterTaskManager::ClusterTaskManager( NodeInfoGetter get_node_info, std::function announce_infeasible_task, WorkerPoolInterface &worker_pool, - std::unordered_map> &leased_workers, - std::function &object_ids, - std::vector> *results)> - pin_task_arguments) + std::unordered_map> &leased_workers) : self_node_id_(self_node_id), cluster_resource_scheduler_(cluster_resource_scheduler), task_dependency_manager_(task_dependency_manager), @@ -34,8 +31,7 @@ ClusterTaskManager::ClusterTaskManager( RayConfig::instance().max_resource_shapes_per_load_report()), report_worker_backlog_(RayConfig::instance().report_worker_backlog()), worker_pool_(worker_pool), - leased_workers_(leased_workers), - pin_task_arguments_(pin_task_arguments) {} + leased_workers_(leased_workers) {} bool ClusterTaskManager::SchedulePendingTasks() { // Always try to schedule infeasible tasks in case they are now feasible. @@ -148,36 +144,11 @@ void ClusterTaskManager::DispatchScheduledTasksToWorkers( auto &task = std::get<0>(work); auto &spec = task.GetTaskSpecification(); - std::vector> args; - bool success = true; - const auto &deps = spec.GetDependencyIds(); - if (!deps.empty()) { - // This gets refs to the arguments stored in plasma. The refs should be - // deleted once we no longer need to pin the arguments. 
- success = pin_task_arguments_(deps, &args); - if (!success) { - RAY_LOG(WARNING) << "Error getting task arguments from plasma store"; - } - for (size_t i = 0; i < deps.size(); i++) { - if (args[i] == nullptr) { - // This can happen if the task's arguments were all local at some - // point, but then at least one was evicted before the task could - // be dispatched to a worker. - RAY_LOG(INFO) - << "Task " << spec.TaskId() << " argument " << deps[i] - << " was evicted before the task could be dispatched. This can happen " - "when there are many objects needed on this node. The task will be " - "scheduled once all of its dependencies are local."; - success = false; - break; - } - } - } - // An argument was evicted since this task was added to the dispatch // queue. Move it back to the waiting queue. The caller is responsible // for notifying us when the task is unblocked again. - if (!success) { + if (!spec.GetDependencies().empty() && + !task_dependency_manager_.IsTaskReady(spec.TaskId())) { waiting_tasks_[spec.TaskId()] = std::move(*work_it); work_it = dispatch_queue.erase(work_it); continue; @@ -206,12 +177,6 @@ void ClusterTaskManager::DispatchScheduledTasksToWorkers( bool worker_leased; bool remove = AttemptDispatchWork(*work_it, worker, &worker_leased); if (worker_leased) { - // Pin the arguments while the lease is active. These will be erased - // once the lease is returned. 
- num_pinned_task_arguments_ += args.size(); - RAY_CHECK(pinned_task_arguments_.emplace(spec.TaskId(), std::move(args)).second) - << spec.TaskId(); - auto reply = std::get<1>(*work_it); auto callback = std::get<2>(*work_it); Dispatch(worker, leased_workers_, task, reply, callback); @@ -330,10 +295,6 @@ void ClusterTaskManager::TaskFinished(std::shared_ptr worker, Task *task) { RAY_CHECK(worker != nullptr && task != nullptr); *task = worker->GetAssignedTask(); - auto it = pinned_task_arguments_.find(task->GetTaskSpecification().TaskId()); - RAY_CHECK(it != pinned_task_arguments_.end()); - num_pinned_task_arguments_ -= it->second.size(); - pinned_task_arguments_.erase(it); if (worker->GetAllocatedInstances() != nullptr) { ReleaseWorkerResources(worker); } @@ -657,23 +618,12 @@ bool ClusterTaskManager::AnyPendingTasks(Task *exemplar, bool *any_pending, std::string ClusterTaskManager::DebugStr() const { // TODO(Shanly): This method will be replaced with `DebugString` once we remove the // legacy scheduler. 
- auto accumulator = [](size_t state, const std::pair> &pair) { - return state + pair.second.size(); - }; - size_t num_infeasible_tasks = std::accumulate( - infeasible_tasks_.begin(), infeasible_tasks_.end(), (size_t)0, accumulator); - size_t num_tasks_to_schedule = std::accumulate( - tasks_to_schedule_.begin(), tasks_to_schedule_.end(), (size_t)0, accumulator); - size_t num_tasks_to_dispatch = std::accumulate( - tasks_to_dispatch_.begin(), tasks_to_dispatch_.end(), (size_t)0, accumulator); std::stringstream buffer; buffer << "========== Node: " << self_node_id_ << " =================\n"; - buffer << "Infeasible queue length: " << num_infeasible_tasks << "\n"; - buffer << "Schedule queue length: " << num_tasks_to_schedule << "\n"; - buffer << "Dispatch queue length: " << num_tasks_to_dispatch << "\n"; + buffer << "Schedule queue length: " << tasks_to_schedule_.size() << "\n"; + buffer << "Dispatch queue length: " << tasks_to_dispatch_.size() << "\n"; buffer << "Waiting tasks size: " << waiting_tasks_.size() << "\n"; - buffer << "Number of executing tasks: " << pinned_task_arguments_.size() << "\n"; - buffer << "Number of pinned task arguments: " << num_pinned_task_arguments_ << "\n"; + buffer << "infeasible queue length size: " << infeasible_tasks_.size() << "\n"; buffer << "cluster_resource_scheduler state: " << cluster_resource_scheduler_->DebugString() << "\n"; buffer << "=================================================="; @@ -723,6 +673,7 @@ void ClusterTaskManager::Dispatch( const Task &task, rpc::RequestWorkerLeaseReply *reply, std::function send_reply_callback) { const auto &task_spec = task.GetTaskSpecification(); + RAY_LOG(DEBUG) << "Dispatching task " << task_spec.TaskId(); // Pass the contact info of the worker to use. 
reply->set_worker_pid(worker->GetProcess().GetId()); reply->mutable_worker_address()->set_ip_address(worker->IpAddress()); @@ -732,7 +683,6 @@ void ClusterTaskManager::Dispatch( RAY_CHECK(leased_workers.find(worker->WorkerId()) == leased_workers.end()); leased_workers[worker->WorkerId()] = worker; - RemoveFromBacklogTracker(task); // Update our internal view of the cluster state. std::shared_ptr allocated_resources; @@ -784,9 +734,7 @@ void ClusterTaskManager::Dispatch( } void ClusterTaskManager::Spillback(const NodeID &spillback_to, const Work &work) { - const auto &task = std::get<0>(work); - const auto &task_spec = task.GetTaskSpecification(); - RemoveFromBacklogTracker(task); + const auto &task_spec = std::get<0>(work).GetTaskSpecification(); RAY_LOG(DEBUG) << "Spilling task " << task_spec.TaskId() << " to node " << spillback_to; if (!cluster_resource_scheduler_->AllocateRemoteTaskResources( diff --git a/src/ray/raylet/scheduling/cluster_task_manager.h b/src/ray/raylet/scheduling/cluster_task_manager.h index 7f2652cebc80..f632357e10f4 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.h +++ b/src/ray/raylet/scheduling/cluster_task_manager.h @@ -2,7 +2,6 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" -#include "ray/common/ray_object.h" #include "ray/common/task/task.h" #include "ray/common/task/task_common.h" #include "ray/raylet/dependency_manager.h" @@ -61,10 +60,7 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { NodeInfoGetter get_node_info, std::function announce_infeasible_task, WorkerPoolInterface &worker_pool, - std::unordered_map> &leased_workers, - std::function &object_ids, - std::vector> *results)> - pin_task_arguments); + std::unordered_map> &leased_workers); /// (Step 1) Queue tasks and schedule. /// Queue task and schedule. This hanppens when processing the worker lease request. 
@@ -252,22 +248,6 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { WorkerPoolInterface &worker_pool_; std::unordered_map> &leased_workers_; - /// Callback to get references to task arguments. These will be pinned while - /// the task is running. - std::function &object_ids, - std::vector> *results)> - pin_task_arguments_; - - /// Arguments needed by currently granted lease requests. These should be - /// pinned before the lease is granted to ensure that the arguments are not - /// evicted before the task(s) start running. - std::unordered_map>> - pinned_task_arguments_; - - /// The total number of arguments pinned for running tasks. - /// Used for debug purposes. - size_t num_pinned_task_arguments_ = 0; - /// Determine whether a task should be immediately dispatched, /// or placed on a wait queue. /// diff --git a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/cluster_task_manager_test.cc index 80a9406da4d5..7c5f00820839 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc @@ -85,7 +85,7 @@ Task CreateTask(const std::unordered_map &required_resource std::make_pair(PlacementGroupID::Nil(), -1), true, ""); for (int i = 0; i < num_args; i++) { - ObjectID put_id = ObjectID::FromIndex(RandomTaskId(), /*index=*/i + 1); + ObjectID put_id = ObjectID::FromIndex(TaskID::Nil(), /*index=*/i + 1); spec_builder.AddArg(TaskArgByReference(put_id, rpc::Address())); } @@ -96,25 +96,20 @@ Task CreateTask(const std::unordered_map &required_resource class MockTaskDependencyManager : public TaskDependencyManagerInterface { public: - MockTaskDependencyManager(std::unordered_set &missing_objects) - : missing_objects_(missing_objects) {} - bool RequestTaskDependencies( const TaskID &task_id, const std::vector &required_objects) { RAY_CHECK(subscribed_tasks.insert(task_id).second); - for (auto &obj_ref : required_objects) { - if 
(missing_objects_.count(ObjectRefToId(obj_ref))) { - return false; - } - } - return true; + return task_ready_; } void RemoveTaskDependencies(const TaskID &task_id) { RAY_CHECK(subscribed_tasks.erase(task_id)); } - std::unordered_set &missing_objects_; + bool IsTaskReady(const TaskID &task_id) const { return task_ready_; } + + bool task_ready_ = true; + std::unordered_set subscribed_tasks; }; @@ -126,34 +121,16 @@ class ClusterTaskManagerTest : public ::testing::Test { is_owner_alive_(true), node_info_calls_(0), announce_infeasible_task_calls_(0), - dependency_manager_(missing_objects_), - task_manager_( - id_, scheduler_, dependency_manager_, - [this](const WorkerID &worker_id, const NodeID &node_id) { - return is_owner_alive_; - }, - [this](const NodeID &node_id) { - node_info_calls_++; - return node_info_[node_id]; - }, - [this](const Task &task) { announce_infeasible_task_calls_++; }, pool_, - leased_workers_, - [this](const std::vector &object_ids, - std::vector> *results) { - for (auto &obj_id : object_ids) { - if (missing_objects_.count(obj_id) == 0) { - std::string meta = "metadata"; - auto metadata = const_cast( - reinterpret_cast(meta.data())); - auto meta_buffer = - std::make_shared(metadata, meta.size()); - results->emplace_back(new RayObject(nullptr, meta_buffer, {})); - } else { - results->emplace_back(nullptr); - } - } - return true; - }) {} + task_manager_(id_, scheduler_, dependency_manager_, + [this](const WorkerID &worker_id, const NodeID &node_id) { + return is_owner_alive_; + }, + [this](const NodeID &node_id) { + node_info_calls_++; + return node_info_[node_id]; + }, + [this](const Task &task) { announce_infeasible_task_calls_++; }, + pool_, leased_workers_) {} void SetUp() {} @@ -176,25 +153,13 @@ class ClusterTaskManagerTest : public ::testing::Test { ASSERT_TRUE(task_manager_.tasks_to_dispatch_.empty()); ASSERT_TRUE(task_manager_.waiting_tasks_.empty()); ASSERT_TRUE(task_manager_.infeasible_tasks_.empty()); - 
ASSERT_TRUE(task_manager_.pinned_task_arguments_.empty()); - ASSERT_EQ(task_manager_.num_pinned_task_arguments_, 0); ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); } - void AssertPinnedTaskArgumentsEquals(const TaskID &task_id, size_t num_args_expected) { - ASSERT_EQ(task_manager_.pinned_task_arguments_[task_id].size(), num_args_expected); - size_t num_args = 0; - for (auto &args : task_manager_.pinned_task_arguments_) { - num_args += args.second.size(); - } - ASSERT_EQ(task_manager_.num_pinned_task_arguments_, num_args); - } - NodeID id_; std::shared_ptr scheduler_; MockWorkerPool pool_; std::unordered_map> leased_workers_; - std::unordered_set missing_objects_; bool is_owner_alive_; @@ -238,11 +203,6 @@ TEST_F(ClusterTaskManagerTest, BasicTest) { ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } @@ -292,9 +252,8 @@ TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { }; /* Blocked on dependencies */ - auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); - auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; - missing_objects_.insert(missing_arg); + dependency_manager_.task_ready_ = false; + auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 1); std::unordered_set expected_subscribed_tasks = { task.GetTaskSpecification().TaskId()}; task_manager_.QueueAndScheduleTask(task, &reply, callback); @@ -305,42 +264,36 @@ TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { ASSERT_EQ(pool_.workers.size(), 2); /* This task can run */ - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 1); + auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}); task_manager_.QueueAndScheduleTask(task2, &reply, callback); ASSERT_EQ(dependency_manager_.subscribed_tasks, 
expected_subscribed_tasks); - AssertPinnedTaskArgumentsEquals(task2.GetTaskSpecification().TaskId(), 1); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); /* First task is unblocked now, but resources are no longer available */ - missing_objects_.erase(missing_arg); + dependency_manager_.task_ready_ = true; auto id = task.GetTaskSpecification().TaskId(); std::vector unblocked = {id}; task_manager_.TasksUnblocked(unblocked); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); - AssertPinnedTaskArgumentsEquals(task2.GetTaskSpecification().TaskId(), 1); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); /* Second task finishes, making space for the original task */ - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); leased_workers_.clear(); + task_manager_.ReleaseWorkerResources(worker); task_manager_.ScheduleAndDispatchTasks(); ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); // Task2 is now done so task can run. - AssertPinnedTaskArgumentsEquals(task.GetTaskSpecification().TaskId(), 2); ASSERT_EQ(num_callbacks, 2); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 0); - - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); AssertNoLeaks(); } @@ -389,12 +342,6 @@ TEST_F(ClusterTaskManagerTest, TestSpillAfterAssigned) { // The second task was spilled. 
ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); - - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } @@ -438,12 +385,6 @@ TEST_F(ClusterTaskManagerTest, TaskCancellationTest) { ASSERT_FALSE(callback_called); ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(leased_workers_.size(), 1); - - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } @@ -613,75 +554,48 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { *callback_occurred_ptr = true; }; - std::vector to_cancel; + std::shared_ptr worker = + std::make_shared(WorkerID::FromRandom(), 1234); + pool_.PushWorker(std::dynamic_pointer_cast(worker)); - // Don't add these fist 2 tasks to `to_cancel`. - for (int i = 0; i < 1; i++) { - Task task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); - task.SetBacklogSize(10 - i); - task_manager_.QueueAndScheduleTask(task, &reply, callback); - } + std::vector to_cancel; - for (int i = 1; i < 10; i++) { - Task task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); - task.SetBacklogSize(10 - i); + for (int i = 0; i < 10; i++) { + Task task = CreateTask({{ray::kCPU_ResourceLabel, 100}}); + task.SetBacklogSize(i); task_manager_.QueueAndScheduleTask(task, &reply, callback); to_cancel.push_back(task.GetTaskSpecification().TaskId()); } ASSERT_FALSE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); - ASSERT_EQ(pool_.workers.size(), 0); + ASSERT_EQ(pool_.workers.size(), 1); ASSERT_EQ(node_info_calls_, 0); - { // No tasks can run because the worker pool is empty. 
- auto data = std::make_shared(); - task_manager_.FillResourceUsage(data); - auto resource_load_by_shape = data->resource_load_by_shape(); - auto shape1 = resource_load_by_shape.resource_demands()[0]; - - ASSERT_EQ(shape1.backlog_size(), 55); - ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); - ASSERT_EQ(shape1.num_ready_requests_queued(), 10); - } + auto data = std::make_shared(); + task_manager_.FillResourceUsage(data); - // Push a worker so the first task can run. - std::shared_ptr worker = - std::make_shared(WorkerID::FromRandom(), 1234); - pool_.PushWorker(worker); - task_manager_.ScheduleAndDispatchTasks(); + auto resource_load_by_shape = data->resource_load_by_shape(); + auto shape1 = resource_load_by_shape.resource_demands()[0]; - { - auto data = std::make_shared(); - task_manager_.FillResourceUsage(data); - auto resource_load_by_shape = data->resource_load_by_shape(); - auto shape1 = resource_load_by_shape.resource_demands()[0]; + ASSERT_EQ(shape1.backlog_size(), 45); + ASSERT_EQ(shape1.num_infeasible_requests_queued(), 10); + ASSERT_EQ(shape1.num_ready_requests_queued(), 0); - ASSERT_TRUE(callback_occurred); - ASSERT_EQ(shape1.backlog_size(), 45); - ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); - ASSERT_EQ(shape1.num_ready_requests_queued(), 9); - } - - // Cancel the rest. 
for (auto &task_id : to_cancel) { ASSERT_TRUE(task_manager_.CancelTask(task_id)); } - RAY_LOG(ERROR) << "Finished cancelling tasks"; - { - auto data = std::make_shared(); - task_manager_.FillResourceUsage(data); - auto resource_load_by_shape = data->resource_load_by_shape(); - ASSERT_EQ(resource_load_by_shape.resource_demands().size(), 0); + data = std::make_shared(); + task_manager_.FillResourceUsage(data); - while (!leased_workers_.empty()) { - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - leased_workers_.erase(leased_workers_.begin()); - } - AssertNoLeaks(); - } + resource_load_by_shape = data->resource_load_by_shape(); + shape1 = resource_load_by_shape.resource_demands()[0]; + + ASSERT_EQ(shape1.backlog_size(), 0); + ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); + ASSERT_EQ(shape1.num_ready_requests_queued(), 0); + AssertNoLeaks(); } TEST_F(ClusterTaskManagerTest, OwnerDeadTest) { @@ -850,9 +764,8 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { }; /* Blocked on dependencies */ + dependency_manager_.task_ready_ = false; auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); - auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; - missing_objects_.insert(missing_arg); std::unordered_set expected_subscribed_tasks = { task.GetTaskSpecification().TaskId()}; task_manager_.QueueAndScheduleTask(task, &reply, callback); @@ -861,7 +774,7 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Task is unblocked now */ - missing_objects_.erase(missing_arg); + dependency_manager_.task_ready_ = true; pool_.workers.clear(); auto id = task.GetTaskSpecification().TaskId(); task_manager_.TasksUnblocked({id}); @@ -870,7 +783,7 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Task argument gets evicted */ - missing_objects_.insert(missing_arg); + dependency_manager_.task_ready_ = false; 
pool_.PushWorker(std::dynamic_pointer_cast(worker)); task_manager_.ScheduleAndDispatchTasks(); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); @@ -878,16 +791,10 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Worker available and arguments available */ - missing_objects_.erase(missing_arg); + dependency_manager_.task_ready_ = true; task_manager_.TasksUnblocked({id}); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); - - Task finished_task; - task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); - ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), - task.GetTaskSpecification().TaskId()); - AssertNoLeaks(); } diff --git a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc index 9801e57c6311..9d5c5a9e95e9 100644 --- a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc +++ b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.cc @@ -71,6 +71,8 @@ void OldClusterResourceScheduler::FillResourceUsage( (*resources_data->mutable_resources_total())[resource_pair.first] = resource_pair.second; } + last_heartbeat_resources_->SetTotalResources( + ResourceSet(local_resources.GetTotalResources())); } if (!last_heartbeat_resources_->GetAvailableResources().IsEqual( @@ -81,6 +83,8 @@ void OldClusterResourceScheduler::FillResourceUsage( (*resources_data->mutable_resources_available())[resource_pair.first] = resource_pair.second; } + last_heartbeat_resources_->SetAvailableResources( + ResourceSet(local_resources.GetAvailableResources())); } } diff --git a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h index 927442c6c078..288a85c1c37a 100644 --- a/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h +++ b/src/ray/raylet/scheduling/old_cluster_resource_scheduler.h @@ -23,7 +23,7 @@ class 
OldClusterResourceScheduler : public ClusterResourceSchedulerInterface { explicit OldClusterResourceScheduler( const NodeID &self_node_id, ResourceIdSet &local_available_resources, std::unordered_map &cluster_resource_map, - const std::shared_ptr last_heartbeat_resources); + std::shared_ptr last_heartbeat_resources); /// Remove node from the cluster data structure. This happens /// when a node fails or it is removed from the cluster. @@ -67,6 +67,6 @@ class OldClusterResourceScheduler : public ClusterResourceSchedulerInterface { std::string self_node_id_string_; ResourceIdSet &local_available_resources_; std::unordered_map &cluster_resource_map_; - const std::shared_ptr last_heartbeat_resources_; + std::shared_ptr last_heartbeat_resources_; }; } // namespace ray diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index 148ed6514631..616e7348283b 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -37,41 +37,21 @@ class MockWorkerClient : public rpc::CoreWorkerClientInterface { void WaitForObjectEviction( const rpc::WaitForObjectEvictionRequest &request, const rpc::ClientCallback &callback) override { - eviction_callbacks.push_back(callback); + callbacks.push_back(callback); } bool ReplyObjectEviction(Status status = Status::OK()) { - if (eviction_callbacks.empty()) { + if (callbacks.size() == 0) { return false; } - auto callback = eviction_callbacks.front(); + auto callback = callbacks.front(); auto reply = rpc::WaitForObjectEvictionReply(); callback(status, reply); - eviction_callbacks.pop_front(); - return true; - } - - void AddSpilledUrl( - const rpc::AddSpilledUrlRequest &request, - const rpc::ClientCallback &callback) override { - object_urls.emplace(ObjectID::FromBinary(request.object_id()), request.spilled_url()); - spilled_url_callbacks.push_back(callback); - } - - bool ReplyAddSpilledUrl(Status status = Status::OK()) { - if 
(spilled_url_callbacks.empty()) { - return false; - } - auto callback = spilled_url_callbacks.front(); - auto reply = rpc::AddSpilledUrlReply(); - callback(status, reply); - spilled_url_callbacks.pop_front(); + callbacks.pop_front(); return true; } - std::deque> eviction_callbacks; - std::unordered_map object_urls; - std::deque> spilled_url_callbacks; + std::list> callbacks; }; class MockIOWorkerClient : public rpc::CoreWorkerClientInterface { @@ -104,16 +84,12 @@ class MockIOWorkerClient : public rpc::CoreWorkerClientInterface { restore_callbacks.push_back(callback); } - bool ReplyRestoreObjects(int64_t bytes_restored, Status status = Status::OK()) { + void ReplyRestoreObjects(int64_t bytes_restored, Status status = Status::OK()) { rpc::RestoreSpilledObjectsReply reply; reply.set_bytes_restored_total(bytes_restored); - if (restore_callbacks.size() == 0) { - return false; - }; auto callback = restore_callbacks.front(); callback(status, reply); restore_callbacks.pop_front(); - return true; } void DeleteSpilledObjects( @@ -209,12 +185,10 @@ class MockObjectInfoAccessor : public gcs::ObjectInfoAccessor { MOCK_METHOD1(AsyncGetAll, Status(const gcs::MultiItemCallback &callback)); - MOCK_METHOD4(AsyncAddLocation, - Status(const ObjectID &object_id, const NodeID &node_id, - size_t object_size, const gcs::StatusCallback &callback)); + MOCK_METHOD3(AsyncAddLocation, Status(const ObjectID &object_id, const NodeID &node_id, + const gcs::StatusCallback &callback)); Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, - const NodeID &spilled_node_id, size_t object_size, const gcs::StatusCallback &callback) { object_urls[object_id] = spilled_url; callbacks.push_back(callback); @@ -277,14 +251,12 @@ class LocalObjectManagerTest : public ::testing::Test { LocalObjectManagerTest() : owner_client(std::make_shared()), client_pool([&](const rpc::Address &addr) { return owner_client; }), - manager_node_id_(NodeID::FromRandom()), - 
manager(manager_node_id_, free_objects_batch_size, + manager(io_service_, free_objects_batch_size, /*free_objects_period_ms=*/1000, worker_pool, object_table, client_pool, + /*object_pinning_enabled=*/true, /*automatic_object_delete_enabled=*/true, /*max_io_workers=*/2, /*min_spilling_size=*/0, - /*is_external_storage_type_fs=*/true, - /*on_objects_freed=*/ [&](const std::vector &object_ids) { for (const auto &object_id : object_ids) { freed.insert(object_id); @@ -293,24 +265,12 @@ class LocalObjectManagerTest : public ::testing::Test { /*is_plasma_object_spillable=*/ [&](const ray::ObjectID &object_id) { return unevictable_objects_.count(object_id) == 0; - }, - /*restore_object_from_remote_node=*/ - [&](const ObjectID &object_id, const std::string spilled_url, - const NodeID &node_id) { - if (remote_node_set_restore_requested_.count(node_id) == 0) { - remote_node_set_restore_requested_.emplace( - node_id, std::unordered_set()); - } - remote_node_set_restore_requested_[node_id].emplace(object_id); }), unpins(std::make_shared>()) { RayConfig::instance().initialize({{"object_spilling_config", "mock_config"}}); } - void TearDown() { - unevictable_objects_.clear(); - remote_node_set_restore_requested_.clear(); - } + void TearDown() { unevictable_objects_.clear(); } std::string BuildURL(const std::string url, int offset = 0, int num_objects = 1) { return url + "?" 
+ "num_objects=" + std::to_string(num_objects) + @@ -323,10 +283,7 @@ class LocalObjectManagerTest : public ::testing::Test { rpc::CoreWorkerClientPool client_pool; MockIOWorkerPool worker_pool; MockObjectInfoAccessor object_table; - NodeID manager_node_id_; LocalObjectManager manager; - std::unordered_map> - remote_node_set_restore_requested_; std::unordered_set freed; // This hashmap is incremented when objects are unpinned by destroying their @@ -353,7 +310,7 @@ TEST_F(LocalObjectManagerTest, TestPin) { new RayObject(nullptr, meta_buffer, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); for (size_t i = 0; i < free_objects_batch_size; i++) { @@ -365,49 +322,16 @@ TEST_F(LocalObjectManagerTest, TestPin) { } TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { - // First, spill objects. - std::vector object_ids; - std::vector> objects; - rpc::Address owner_address; - owner_address.set_worker_id(WorkerID::FromRandom().Binary()); - - for (size_t i = 0; i < free_objects_batch_size; i++) { - ObjectID object_id = ObjectID::FromRandom(); - object_ids.push_back(object_id); - auto data_buffer = std::make_shared(0, object_id, unpins); - std::unique_ptr object( - new RayObject(data_buffer, nullptr, std::vector())); - objects.push_back(std::move(object)); - } - manager.PinObjects(object_ids, std::move(objects), owner_address); - - manager.SpillObjects(object_ids, - [&](const Status &status) mutable { ASSERT_TRUE(status.ok()); }); - std::vector urls; - for (size_t i = 0; i < object_ids.size(); i++) { - urls.push_back(BuildURL("url" + std::to_string(i))); - } - ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); - for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - 
ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } - } - - // Then try restoring objects from local. - ObjectID object_id = object_ids[0]; - const auto url = urls[0]; + ObjectID object_id = ObjectID::FromRandom(); + std::string object_url("url"); int num_times_fired = 0; EXPECT_CALL(worker_pool, PushRestoreWorker(_)); // Subsequent calls should be deduped, so that only one callback should be fired. for (int i = 0; i < 10; i++) { - manager.AsyncRestoreSpilledObject(object_id, url, manager_node_id_, - [&](const Status &status) { - ASSERT_TRUE(status.ok()); - num_times_fired++; - }); + manager.AsyncRestoreSpilledObject(object_id, object_url, [&](const Status &status) { + ASSERT_TRUE(status.ok()); + num_times_fired++; + }); } ASSERT_EQ(num_times_fired, 0); @@ -417,32 +341,12 @@ TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { ASSERT_EQ(num_times_fired, 0); } worker_pool.io_worker_client->ReplyRestoreObjects(10); - // The restore should've been invoked. ASSERT_EQ(num_times_fired, 1); - - // If the object wasn't spilled on the current node, it should request restoration to - // remote nodes. - ObjectID remote_object_id = ObjectID::FromRandom(); - const auto remote_object_url = BuildURL("remote_url"); - NodeID remote_node_id = NodeID::FromRandom(); - manager.AsyncRestoreSpilledObject(remote_object_id, remote_object_url, remote_node_id, - [&](const Status &status) { - ASSERT_TRUE(status.ok()); - num_times_fired++; - }); - // Make sure the remote call was invoked. 
- ASSERT_FALSE(worker_pool.io_worker_client->ReplyRestoreObjects(10)); - ASSERT_TRUE(remote_node_set_restore_requested_.count(remote_node_id) > 0); - ASSERT_TRUE(remote_node_set_restore_requested_[remote_node_id].count(remote_object_id) > - 0); - ASSERT_EQ(num_times_fired, 2); } TEST_F(LocalObjectManagerTest, TestExplicitSpill) { std::vector object_ids; std::vector> objects; - rpc::Address owner_address; - owner_address.set_worker_id(WorkerID::FromRandom().Binary()); for (size_t i = 0; i < free_objects_batch_size; i++) { ObjectID object_id = ObjectID::FromRandom(); @@ -452,7 +356,7 @@ TEST_F(LocalObjectManagerTest, TestExplicitSpill) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); int num_times_fired = 0; manager.SpillObjects(object_ids, [&](const Status &status) mutable { @@ -471,19 +375,11 @@ TEST_F(LocalObjectManagerTest, TestExplicitSpill) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } ASSERT_EQ(num_times_fired, 1); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls[object_ids[i]], urls[i]); - } else { - ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); - } + ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); } for (const auto &id : object_ids) { ASSERT_EQ((*unpins)[id], 1); @@ -505,7 +401,7 @@ TEST_F(LocalObjectManagerTest, TestDuplicateSpill) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - 
manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); int num_times_fired = 0; @@ -529,19 +425,11 @@ TEST_F(LocalObjectManagerTest, TestDuplicateSpill) { EXPECT_CALL(worker_pool, PushSpillWorker(_)); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } ASSERT_EQ(num_times_fired, 1); for (size_t i = 0; i < object_ids.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls[object_ids[i]], urls[i]); - } else { - ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); - } + ASSERT_EQ(object_table.object_urls[object_ids[i]], urls[i]); } ASSERT_FALSE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (const auto &id : object_ids) { @@ -567,7 +455,7 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); ASSERT_TRUE(manager.SpillObjectsOfSize(total_size / 2)); for (const auto &id : object_ids) { ASSERT_EQ((*unpins)[id], 0); @@ -584,26 +472,13 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) { // to evict. 
ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < urls.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls.size(), object_ids.size() / 2 + 1); - for (auto &object_url : owner_client->object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } else { - ASSERT_EQ(object_table.object_urls.size(), object_ids.size() / 2 + 1); - for (auto &object_url : object_table.object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } + ASSERT_EQ(object_table.object_urls.size(), object_ids.size() / 2 + 1); + for (auto &object_url : object_table.object_urls) { + auto it = std::find(urls.begin(), urls.end(), object_url.second); + ASSERT_TRUE(it != urls.end()); + ASSERT_EQ((*unpins)[object_url.first], 1); } // Make sure providing 0 bytes to SpillObjectsOfSize will spill one object. 
@@ -612,23 +487,13 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) { EXPECT_CALL(worker_pool, PushSpillWorker(_)); const std::string url = BuildURL("url" + std::to_string(object_ids.size())); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({url})); + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); + ASSERT_EQ(object_table.object_urls.size(), 3); urls.push_back(url); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - ASSERT_EQ(owner_client->object_urls.size(), 3); - for (auto &object_url : owner_client->object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - ASSERT_EQ(object_table.object_urls.size(), 3); - for (auto &object_url : object_table.object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } + for (auto &object_url : object_table.object_urls) { + auto it = std::find(urls.begin(), urls.end(), object_url.second); + ASSERT_TRUE(it != urls.end()); + ASSERT_EQ((*unpins)[object_url.first], 1); } // Since there's no more object to spill, this should fail. 
@@ -653,7 +518,7 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectNotEvictable) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); ASSERT_FALSE(manager.SpillObjectsOfSize(1000)); for (const auto &id : object_ids) { ASSERT_EQ((*unpins)[id], 0); @@ -682,7 +547,7 @@ TEST_F(LocalObjectManagerTest, TestSpillUptoMaxThroughput) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); // This will spill until 2 workers are occupied. manager.SpillObjectUptoMaxThroughput(); @@ -699,23 +564,12 @@ TEST_F(LocalObjectManagerTest, TestSpillUptoMaxThroughput) { std::vector urls; urls.push_back(BuildURL("url" + std::to_string(0))); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({urls[0]})); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - // Make sure object is spilled. - ASSERT_EQ(owner_client->object_urls.size(), 1); - for (auto &object_url : owner_client->object_urls) { - if (urls[0] == object_url.second) { - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - // Make sure object is spilled. - ASSERT_EQ(object_table.object_urls.size(), 1); - for (auto &object_url : object_table.object_urls) { - if (urls[0] == object_url.second) { - ASSERT_EQ((*unpins)[object_url.first], 1); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); + // Make sure object is spilled. 
+ ASSERT_EQ(object_table.object_urls.size(), 1); + for (auto &object_url : object_table.object_urls) { + if (urls[0] == object_url.second) { + ASSERT_EQ((*unpins)[object_url.first], 1); } } @@ -733,26 +587,13 @@ TEST_F(LocalObjectManagerTest, TestSpillUptoMaxThroughput) { } for (size_t i = 1; i < urls.size(); i++) { ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({urls[i]})); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_EQ(owner_client->object_urls.size(), 3); - for (auto &object_url : owner_client->object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } - } else { - ASSERT_EQ(object_table.object_urls.size(), 3); - for (auto &object_url : object_table.object_urls) { - auto it = std::find(urls.begin(), urls.end(), object_url.second); - ASSERT_TRUE(it != urls.end()); - ASSERT_EQ((*unpins)[object_url.first], 1); - } + ASSERT_EQ(object_table.object_urls.size(), 3); + for (auto &object_url : object_table.object_urls) { + auto it = std::find(urls.begin(), urls.end(), object_url.second); + ASSERT_TRUE(it != urls.end()); + ASSERT_EQ((*unpins)[object_url.first], 1); } // We cannot spill anymore as there is no more pinned object. 
@@ -773,7 +614,7 @@ TEST_F(LocalObjectManagerTest, TestSpillError) { std::vector> objects; objects.push_back(std::move(object)); - manager.PinObjects({object_id}, std::move(objects), owner_address); + manager.PinObjects({object_id}, std::move(objects)); int num_times_fired = 0; manager.SpillObjects({object_id}, [&](const Status &status) mutable { @@ -785,11 +626,7 @@ TEST_F(LocalObjectManagerTest, TestSpillError) { EXPECT_CALL(worker_pool, PushSpillWorker(_)); ASSERT_TRUE( worker_pool.io_worker_client->ReplySpillObjects({}, Status::IOError("error"))); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_FALSE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_FALSE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_FALSE(object_table.ReplyAsyncAddSpilledUrl()); ASSERT_EQ(num_times_fired, 1); ASSERT_EQ((*unpins)[object_id], 0); @@ -801,14 +638,9 @@ TEST_F(LocalObjectManagerTest, TestSpillError) { std::string url = BuildURL("url"); EXPECT_CALL(worker_pool, PushSpillWorker(_)); ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({url})); - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - ASSERT_EQ(owner_client->object_urls[object_id], url); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - ASSERT_EQ(object_table.object_urls[object_id], url); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); ASSERT_EQ(num_times_fired, 2); + ASSERT_EQ(object_table.object_urls[object_id], url); ASSERT_EQ((*unpins)[object_id], 1); } @@ -828,7 +660,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteNoSpilledObjects) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); for (size_t i = 0; i < free_objects_batch_size; i++) { @@ -856,7 
+688,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpilledObjects) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); // 2 Objects are spilled out of 3. @@ -873,11 +705,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpilledObjects) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // All objects are out of scope now. @@ -908,7 +736,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCount) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); // Every object is spilled. @@ -929,11 +757,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteURLRefCount) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Everything is evicted except the last object. In this case, ref count is still > 0. 
@@ -969,7 +793,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpillingObjectsBlocking) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); // Objects are spilled. @@ -988,11 +812,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpillingObjectsBlocking) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < 1; i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Every object has gone out of scope. for (size_t i = 0; i < free_objects_batch_size; i++) { @@ -1011,11 +831,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteSpillingObjectsBlocking) { new_urls.push_back(BuildURL("url" + std::to_string(i))); } for (size_t i = 1; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Every object is now deleted. 
@@ -1040,7 +856,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteMaxObjects) { new RayObject(data_buffer, nullptr, std::vector())); objects.push_back(std::move(object)); } - manager.PinObjects(object_ids, std::move(objects), owner_address); + manager.PinObjects(object_ids, std::move(objects)); manager.WaitForObjectFree(owner_address, object_ids); std::vector object_ids_to_spill; @@ -1058,11 +874,7 @@ TEST_F(LocalObjectManagerTest, TestDeleteMaxObjects) { } ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); for (size_t i = 0; i < object_ids_to_spill.size(); i++) { - if (RayConfig::instance().ownership_based_object_directory_enabled()) { - ASSERT_TRUE(owner_client->ReplyAddSpilledUrl()); - } else { - ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); - } + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); } // Every reference has gone out of scope. diff --git a/src/ray/raylet/test/util.h b/src/ray/raylet/test/util.h index c43a386fba14..8527220e3df8 100644 --- a/src/ray/raylet/test/util.h +++ b/src/ray/raylet/test/util.h @@ -33,7 +33,7 @@ class MockWorker : public WorkerInterface { void AssignTaskId(const TaskID &task_id) {} - void SetAssignedTask(const Task &assigned_task) { task_ = assigned_task; } + void SetAssignedTask(const Task &assigned_task) {} const std::string IpAddress() const { return address_.ip_address(); } @@ -162,7 +162,11 @@ class MockWorker : public WorkerInterface { void SetBundleId(const BundleID &bundle_id) { bundle_id_ = bundle_id; } - Task &GetAssignedTask() { return task_; } + Task &GetAssignedTask() { + RAY_CHECK(false) << "Method unused"; + auto *t = new Task(); + return *t; + } bool IsRegistered() { RAY_CHECK(false) << "Method unused"; @@ -184,7 +188,6 @@ class MockWorker : public WorkerInterface { bool is_detached_actor_; BundleID bundle_id_; bool blocked_ = false; - Task task_; }; } // namespace raylet diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 89749f2d4b26..93a568748e80 100644 --- 
a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -159,8 +159,9 @@ Process WorkerPool::StartWorkerProcess( return Process(); } // Either there are no workers pending registration or the worker start is being forced. - RAY_LOG(DEBUG) << "Starting new worker process, current pool has " << state.idle.size() - << " workers"; + RAY_LOG(DEBUG) << "Starting new worker process, current pool has " + << state.idle_actor.size() << " actor workers, and " << state.idle.size() + << " non-actor workers"; int workers_to_start = 1; if (dynamic_options.empty()) { @@ -624,11 +625,15 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { state.idle_dedicated_workers[task_id] = worker; } else { // The worker is not used for the actor creation task with dynamic options. - // Put the worker to the idle pool. - state.idle.insert(worker); - int64_t now = current_time_ms(); - idle_of_all_languages_.emplace_back(worker, now); - idle_of_all_languages_map_[worker] = now; + // Put the worker to the corresponding idle pool. + if (worker->GetActorId().IsNil()) { + state.idle.insert(worker); + int64_t now = current_time_ms(); + idle_of_all_languages_.emplace_back(worker, now); + idle_of_all_languages_map_[worker] = now; + } else { + state.idle_actor[worker->GetActorId()] = worker; + } } } @@ -782,18 +787,14 @@ std::shared_ptr WorkerPool::PopWorker( state.tasks_to_dedicated_workers[task_spec.TaskId()] = proc; } } - } else if (task_spec.IsActorTask()) { - // Code path of actor task. - RAY_CHECK(false) << "Direct call shouldn't reach here."; - } else { + } else if (!task_spec.IsActorTask()) { // Code path of normal task or actor creation task without dynamic worker options. // Find an available worker which is already assigned to this job. // Try to pop the most recently pushed worker. 
for (auto it = idle_of_all_languages_.rbegin(); it != idle_of_all_languages_.rend(); it++) { if (task_spec.GetLanguage() != it->first->GetLanguage() || - it->first->GetAssignedJobId() != task_spec.JobId() || - state.pending_disconnection_workers.count(it->first) > 0) { + it->first->GetAssignedJobId() != task_spec.JobId()) { continue; } state.idle.erase(it->first); @@ -811,6 +812,14 @@ std::shared_ptr WorkerPool::PopWorker( proc = StartWorkerProcess(task_spec.GetLanguage(), rpc::WorkerType::WORKER, task_spec.JobId()); } + } else { + // Code path of actor task. + const auto &actor_id = task_spec.ActorId(); + auto actor_entry = state.idle_actor.find(actor_id); + if (actor_entry != state.idle_actor.end()) { + worker = std::move(actor_entry->second); + state.idle_actor.erase(actor_entry); + } } if (worker == nullptr && proc.IsValid()) { @@ -858,12 +867,9 @@ void WorkerPool::PrestartWorkers(const TaskSpecification &task_spec, } } -bool WorkerPool::DisconnectWorker(const std::shared_ptr &worker, - rpc::WorkerExitType disconnect_type) { +bool WorkerPool::DisconnectWorker(const std::shared_ptr &worker) { auto &state = GetStateForLanguage(worker->GetLanguage()); RAY_CHECK(RemoveWorker(state.registered_workers, worker)); - RAY_UNUSED(RemoveWorker(state.pending_disconnection_workers, worker)); - for (auto it = idle_of_all_languages_.begin(); it != idle_of_all_languages_.end(); it++) { if (it->first == worker) { @@ -874,25 +880,7 @@ bool WorkerPool::DisconnectWorker(const std::shared_ptr &worker } MarkPortAsFree(worker->AssignedPort()); - auto status = RemoveWorker(state.idle, worker); - if (disconnect_type != rpc::WorkerExitType::INTENDED_EXIT) { - // A Java worker process may have multiple workers. If one of them disconnects - // unintentionally (which means that the worker process has died), we remove the - // others from idle pool so that the failed actor will not be rescheduled on the same - // process. 
- auto pid = worker->GetProcess().GetId(); - for (auto worker2 : state.registered_workers) { - if (worker2->GetProcess().GetId() == pid) { - // NOTE(kfstorm): We have to use a new field to record these workers (instead of - // just removing them from idle sets) because they may haven't announced worker - // port yet. When they announce worker port, they'll be marked idle again. So - // removing them from idle sets here doesn't really prevent them from being popped - // later. - state.pending_disconnection_workers.insert(worker2); - } - } - } - return status; + return RemoveWorker(state.idle, worker); } void WorkerPool::DisconnectDriver(const std::shared_ptr &driver) { @@ -1059,10 +1047,6 @@ std::string WorkerPool::DebugString() const { << " workers: " << entry.second.registered_workers.size(); result << "\n- num " << Language_Name(entry.first) << " drivers: " << entry.second.registered_drivers.size(); - result << "\n- num object spill callbacks queued: " - << entry.second.spill_io_worker_state.pending_io_tasks.size(); - result << "\n- num object restore queued: " - << entry.second.restore_io_worker_state.pending_io_tasks.size(); } result << "\n- num idle workers: " << idle_of_all_languages_.size(); return result.str(); diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h index ae7d1c52cddd..66d4b94c7700 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -184,11 +184,9 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { /// Disconnect a registered worker. /// - /// \param worker The worker to disconnect. The worker must be registered. - /// \param disconnect_type Type of a worker exit. + /// \param The worker to disconnect. The worker must be registered. /// \return Whether the given worker was in the pool of idle workers. 
- bool DisconnectWorker(const std::shared_ptr &worker, - rpc::WorkerExitType disconnect_type); + bool DisconnectWorker(const std::shared_ptr &worker); /// Disconnect a registered driver. /// @@ -360,6 +358,8 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { std::unordered_map> idle_dedicated_workers; /// The pool of idle non-actor workers. std::unordered_set> idle; + /// The pool of idle actor workers. + std::unordered_map> idle_actor; // States for io workers used for spilling objects. IOWorkerState spill_io_worker_state; // States for io workers used for restoring objects. @@ -369,9 +369,6 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { std::unordered_set> registered_workers; /// All drivers that have registered and are still connected. std::unordered_set> registered_drivers; - /// All workers that have registered but is about to disconnect. They shouldn't be - /// popped anymore. - std::unordered_set> pending_disconnection_workers; /// A map from the pids of starting worker processes /// to the number of their unregistered workers. std::unordered_map starting_worker_processes; diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc index 044dc33a2ede..ee8f3356bb77 100644 --- a/src/ray/raylet/worker_pool_test.cc +++ b/src/ray/raylet/worker_pool_test.cc @@ -268,8 +268,7 @@ TEST_F(WorkerPoolTest, HandleWorkerRegistration) { // Check that there's no starting worker process ASSERT_EQ(worker_pool_->NumWorkerProcessesStarting(), 0); for (const auto &worker : workers) { - worker_pool_->DisconnectWorker( - worker, /*disconnect_type=*/rpc::WorkerExitType::INTENDED_EXIT); + worker_pool_->DisconnectWorker(worker); // Check that we cannot lookup the worker after it's disconnected. 
ASSERT_EQ(worker_pool_->GetRegisteredWorker(worker->Connection()), nullptr); } @@ -344,6 +343,28 @@ TEST_F(WorkerPoolTest, HandleWorkerPushPop) { ASSERT_EQ(popped_worker, nullptr); } +TEST_F(WorkerPoolTest, PopActorWorker) { + // Create a worker. + auto worker = CreateWorker(Process::CreateNewDummy()); + // Add the worker to the pool. + worker_pool_->PushWorker(worker); + + // Assign an actor ID to the worker. + const auto task_spec = ExampleTaskSpec(); + auto actor = worker_pool_->PopWorker(task_spec); + auto actor_id = ActorID::Of(JOB_ID, TaskID::ForDriverTask(JOB_ID), 1); + actor->AssignActorId(actor_id); + worker_pool_->PushWorker(actor); + + // Check that there are no more non-actor workers. + ASSERT_EQ(worker_pool_->PopWorker(task_spec), nullptr); + // Check that we can pop the actor worker. + const auto actor_task_spec = ExampleTaskSpec(actor_id); + actor = worker_pool_->PopWorker(actor_task_spec); + ASSERT_EQ(actor, worker); + ASSERT_EQ(actor->GetActorId(), actor_id); +} + TEST_F(WorkerPoolTest, PopWorkersOfMultipleLanguages) { // Create a Python Worker, and add it to the pool auto py_worker = CreateWorker(Process::CreateNewDummy(), Language::PYTHON); @@ -407,19 +428,25 @@ TEST_F(WorkerPoolTest, PopWorkerMultiTenancy) { worker_pool_->PushWorker(worker); } } + std::unordered_set worker_ids; for (int round = 0; round < 2; round++) { std::vector> workers; - // Pop workers for actor. + // Pop workers for actor (creation) tasks. for (auto job_id : job_ids) { - auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); - // Pop workers for actor creation tasks. - auto task_spec = ExampleTaskSpec(/*actor_id=*/ActorID::Nil(), Language::PYTHON, - job_id, actor_creation_id); + auto actor_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); + // For the first round, we pop for actor creation tasks. + // For the second round, we pop for actor tasks. + auto task_spec = + ExampleTaskSpec(round == 0 ? 
ActorID::Nil() : actor_id, Language::PYTHON, + job_id, round == 0 ? actor_id : ActorID::Nil()); auto worker = worker_pool_->PopWorker(task_spec); ASSERT_TRUE(worker); ASSERT_EQ(worker->GetAssignedJobId(), job_id); + if (round == 0) { + worker->AssignActorId(actor_id); + } workers.push_back(worker); } @@ -711,42 +738,6 @@ TEST_F(WorkerPoolTest, DeleteWorkerPushPop) { }); } -TEST_F(WorkerPoolTest, NoPopOnCrashedWorkerProcess) { - // Start a Java worker process. - Process proc = - worker_pool_->StartWorkerProcess(Language::JAVA, rpc::WorkerType::WORKER, JOB_ID); - auto worker1 = CreateWorker(Process(), Language::JAVA); - auto worker2 = CreateWorker(Process(), Language::JAVA); - - // We now imitate worker process crashing while core worker initializing. - - // 1. we register both workers. - RAY_CHECK_OK(worker_pool_->RegisterWorker(worker1, proc.GetId(), [](Status, int) {})); - RAY_CHECK_OK(worker_pool_->RegisterWorker(worker2, proc.GetId(), [](Status, int) {})); - - // 2. announce worker port for worker 1. When interacting with worker pool, it's - // PushWorker. - worker_pool_->PushWorker(worker1); - - // 3. kill the worker process. Now let's assume that Raylet found that the connection - // with worker 1 disconnected first. - worker_pool_->DisconnectWorker( - worker1, /*disconnect_type=*/rpc::WorkerExitType::SYSTEM_ERROR_EXIT); - - // 4. but the RPC for announcing worker port for worker 2 is already in Raylet input - // buffer. So now Raylet needs to handle worker 2. - worker_pool_->PushWorker(worker2); - - // 5. Let's try to pop a worker to execute a task. Worker 2 shouldn't be popped because - // the process has crashed. - const auto task_spec = ExampleTaskSpec(); - ASSERT_EQ(worker_pool_->PopWorker(task_spec), nullptr); - - // 6. Now Raylet disconnects with worker 2. 
- worker_pool_->DisconnectWorker( - worker2, /*disconnect_type=*/rpc::WorkerExitType::SYSTEM_ERROR_EXIT); -} - } // namespace raylet } // namespace ray diff --git a/src/ray/raylet_client/raylet_client.cc b/src/ray/raylet_client/raylet_client.cc index b3177071a144..739832b2bb40 100644 --- a/src/ray/raylet_client/raylet_client.cc +++ b/src/ray/raylet_client/raylet_client.cc @@ -311,18 +311,6 @@ void raylet::RayletClient::RequestObjectSpillage( grpc_client_->RequestObjectSpillage(request, callback); } -void raylet::RayletClient::RestoreSpilledObject( - const ObjectID &object_id, const std::string &object_url, - const NodeID &spilled_node_id, - const rpc::ClientCallback &callback) { - RAY_CHECK(!spilled_node_id.IsNil()); - rpc::RestoreSpilledObjectRequest request; - request.set_object_id(object_id.Binary()); - request.set_object_url(object_url); - request.set_spilled_node_id(spilled_node_id.Binary()); - grpc_client_->RestoreSpilledObject(request, callback); -} - Status raylet::RayletClient::ReturnWorker(int worker_port, const WorkerID &worker_id, bool disconnect_worker) { rpc::ReturnWorkerRequest request; diff --git a/src/ray/raylet_client/raylet_client.h b/src/ray/raylet_client/raylet_client.h index cf9cfea56d7f..185ca445ac3b 100644 --- a/src/ray/raylet_client/raylet_client.h +++ b/src/ray/raylet_client/raylet_client.h @@ -332,15 +332,6 @@ class RayletClient : public RayletClientInterface { const ObjectID &object_id, const rpc::ClientCallback &callback); - /// Ask the raylet to restore the object of a given id. - /// \param object_id Object id that the remote raylet needs to restore. - /// \param object_url Object URL where the object is spilled. - /// \param spilled_node_id Node id of a node where the object is spilled. - void RestoreSpilledObject( - const ObjectID &object_id, const std::string &object_url, - const NodeID &spilled_node_id, - const rpc::ClientCallback &callback); - /// Implements WorkerLeaseInterface. 
void RequestWorkerLease( const ray::TaskSpecification &resource_spec, diff --git a/src/ray/rpc/gcs_server/gcs_rpc_client.h b/src/ray/rpc/gcs_server/gcs_rpc_client.h index bae0e56bd9ae..fa77fddd2845 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_client.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_client.h @@ -144,10 +144,6 @@ class GcsRpcClient { VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, GetAllActorInfo, actor_info_grpc_client_, ) - /// Kill actor via GCS Service. - VOID_GCS_RPC_CLIENT_METHOD(ActorInfoGcsService, KillActorViaGcs, - actor_info_grpc_client_, ) - /// Register a node to GCS Service. VOID_GCS_RPC_CLIENT_METHOD(NodeInfoGcsService, RegisterNode, node_info_grpc_client_, ) @@ -258,10 +254,6 @@ class GcsRpcClient { VOID_GCS_RPC_CLIENT_METHOD(PlacementGroupInfoGcsService, GetPlacementGroup, placement_group_info_grpc_client_, ) - /// Get placement group data from GCS Service by name. - VOID_GCS_RPC_CLIENT_METHOD(PlacementGroupInfoGcsService, GetNamedPlacementGroup, - placement_group_info_grpc_client_, ) - /// Get information of all placement group from GCS Service. VOID_GCS_RPC_CLIENT_METHOD(PlacementGroupInfoGcsService, GetAllPlacementGroup, placement_group_info_grpc_client_, ) diff --git a/src/ray/rpc/gcs_server/gcs_rpc_server.h b/src/ray/rpc/gcs_server/gcs_rpc_server.h index 246a5ee9e306..0add85c0e04b 100644 --- a/src/ray/rpc/gcs_server/gcs_rpc_server.h +++ b/src/ray/rpc/gcs_server/gcs_rpc_server.h @@ -125,10 +125,6 @@ class ActorInfoGcsServiceHandler { virtual void HandleGetAllActorInfo(const GetAllActorInfoRequest &request, GetAllActorInfoReply *reply, SendReplyCallback send_reply_callback) = 0; - - virtual void HandleKillActorViaGcs(const KillActorViaGcsRequest &request, - KillActorViaGcsReply *reply, - SendReplyCallback send_reply_callback) = 0; }; /// The `GrpcService` for `ActorInfoGcsService`. 
@@ -152,7 +148,6 @@ class ActorInfoGrpcService : public GrpcService { ACTOR_INFO_SERVICE_RPC_HANDLER(GetActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetNamedActorInfo); ACTOR_INFO_SERVICE_RPC_HANDLER(GetAllActorInfo); - ACTOR_INFO_SERVICE_RPC_HANDLER(KillActorViaGcs); } private: @@ -527,10 +522,6 @@ class PlacementGroupInfoGcsServiceHandler { const WaitPlacementGroupUntilReadyRequest &request, WaitPlacementGroupUntilReadyReply *reply, SendReplyCallback send_reply_callback) = 0; - - virtual void HandleGetNamedPlacementGroup(const GetNamedPlacementGroupRequest &request, - GetNamedPlacementGroupReply *reply, - SendReplyCallback send_reply_callback) = 0; }; /// The `GrpcService` for `PlacementGroupInfoGcsService`. @@ -552,7 +543,6 @@ class PlacementGroupInfoGrpcService : public GrpcService { PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(CreatePlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(RemovePlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetPlacementGroup); - PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetNamedPlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(GetAllPlacementGroup); PLACEMENT_GROUP_INFO_SERVICE_RPC_HANDLER(WaitPlacementGroupUntilReady); } diff --git a/src/ray/rpc/node_manager/node_manager_client.h b/src/ray/rpc/node_manager/node_manager_client.h index 81182ab94ab4..1c9b16c18370 100644 --- a/src/ray/rpc/node_manager/node_manager_client.h +++ b/src/ray/rpc/node_manager/node_manager_client.h @@ -100,9 +100,6 @@ class NodeManagerWorkerClient /// Ask the raylet to spill an object to external storage. VOID_RPC_CLIENT_METHOD(NodeManagerService, RequestObjectSpillage, grpc_client_, ) - /// Ask the raylet to restore an object from external storage. - VOID_RPC_CLIENT_METHOD(NodeManagerService, RestoreSpilledObject, grpc_client_, ) - /// Release unused bundles. 
VOID_RPC_CLIENT_METHOD(NodeManagerService, ReleaseUnusedBundles, grpc_client_, ) diff --git a/src/ray/rpc/node_manager/node_manager_server.h b/src/ray/rpc/node_manager/node_manager_server.h index 7f769150871c..08893d49f7a7 100644 --- a/src/ray/rpc/node_manager/node_manager_server.h +++ b/src/ray/rpc/node_manager/node_manager_server.h @@ -36,7 +36,6 @@ namespace rpc { RPC_SERVICE_HANDLER(NodeManagerService, CommitBundleResources) \ RPC_SERVICE_HANDLER(NodeManagerService, CancelResourceReserve) \ RPC_SERVICE_HANDLER(NodeManagerService, RequestObjectSpillage) \ - RPC_SERVICE_HANDLER(NodeManagerService, RestoreSpilledObject) \ RPC_SERVICE_HANDLER(NodeManagerService, ReleaseUnusedBundles) /// Interface of the `NodeManagerService`, see `src/ray/protobuf/node_manager.proto`. @@ -103,10 +102,6 @@ class NodeManagerServiceHandler { RequestObjectSpillageReply *reply, SendReplyCallback send_reply_callback) = 0; - virtual void HandleRestoreSpilledObject(const RestoreSpilledObjectRequest &request, - RestoreSpilledObjectReply *reply, - SendReplyCallback send_reply_callback) = 0; - virtual void HandleReleaseUnusedBundles(const ReleaseUnusedBundlesRequest &request, ReleaseUnusedBundlesReply *reply, SendReplyCallback send_reply_callback) = 0; diff --git a/src/ray/rpc/worker/core_worker_client.h b/src/ray/rpc/worker/core_worker_client.h index 8f2796581e31..a014a1776a4e 100644 --- a/src/ray/rpc/worker/core_worker_client.h +++ b/src/ray/rpc/worker/core_worker_client.h @@ -186,9 +186,6 @@ class CoreWorkerClientInterface { const DeleteSpilledObjectsRequest &request, const ClientCallback &callback) {} - virtual void AddSpilledUrl(const AddSpilledUrlRequest &request, - const ClientCallback &callback) {} - virtual void PlasmaObjectReady(const PlasmaObjectReadyRequest &request, const ClientCallback &callback) { } @@ -254,8 +251,6 @@ class CoreWorkerClient : public std::enable_shared_from_this, VOID_RPC_CLIENT_METHOD(CoreWorkerService, DeleteSpilledObjects, grpc_client_, override) - 
VOID_RPC_CLIENT_METHOD(CoreWorkerService, AddSpilledUrl, grpc_client_, override) - VOID_RPC_CLIENT_METHOD(CoreWorkerService, PlasmaObjectReady, grpc_client_, override) VOID_RPC_CLIENT_METHOD(CoreWorkerService, Exit, grpc_client_, override) diff --git a/src/ray/rpc/worker/core_worker_server.h b/src/ray/rpc/worker/core_worker_server.h index 37c01cf484c2..8f9d236e0b97 100644 --- a/src/ray/rpc/worker/core_worker_server.h +++ b/src/ray/rpc/worker/core_worker_server.h @@ -44,7 +44,6 @@ namespace rpc { RPC_SERVICE_HANDLER(CoreWorkerService, SpillObjects) \ RPC_SERVICE_HANDLER(CoreWorkerService, RestoreSpilledObjects) \ RPC_SERVICE_HANDLER(CoreWorkerService, DeleteSpilledObjects) \ - RPC_SERVICE_HANDLER(CoreWorkerService, AddSpilledUrl) \ RPC_SERVICE_HANDLER(CoreWorkerService, PlasmaObjectReady) \ RPC_SERVICE_HANDLER(CoreWorkerService, Exit) @@ -66,7 +65,6 @@ namespace rpc { DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(SpillObjects) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(RestoreSpilledObjects) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(DeleteSpilledObjects) \ - DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(AddSpilledUrl) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(PlasmaObjectReady) \ DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(Exit) diff --git a/src/ray/stats/metric.cc b/src/ray/stats/metric.cc index d4b253428b92..4a475a338408 100644 --- a/src/ray/stats/metric.cc +++ b/src/ray/stats/metric.cc @@ -22,8 +22,6 @@ namespace ray { namespace stats { -absl::Mutex Metric::registration_mutex_; - static void RegisterAsView(opencensus::stats::ViewDescriptor view_descriptor, const std::vector &keys) { // Register global keys. @@ -87,24 +85,19 @@ void Metric::Record(double value, const TagsType &tags) { return; } - // NOTE(lingxuan.zlx): Double check for recording performance while - // processing in multithread and avoid race since metrics may invoke - // record in different threads or code pathes. 
if (measure_ == nullptr) { - absl::MutexLock lock(®istration_mutex_); - if (measure_ == nullptr) { - // Measure could be registered before, so we try to get it first. - MeasureDouble registered_measure = - opencensus::stats::MeasureRegistry::GetMeasureDoubleByName(name_); - - if (registered_measure.IsValid()) { - measure_.reset(new MeasureDouble(registered_measure)); - } else { - measure_.reset( - new MeasureDouble(MeasureDouble::Register(name_, description_, unit_))); - } - RegisterView(); + // Measure could be registered before, so we try to get it first. + MeasureDouble registered_measure = + opencensus::stats::MeasureRegistry::GetMeasureDoubleByName(name_); + + if (registered_measure.IsValid()) { + measure_.reset(new MeasureDouble(registered_measure)); + } else { + measure_.reset( + new MeasureDouble(MeasureDouble::Register(name_, description_, unit_))); } + + RegisterView(); } // Do record. diff --git a/src/ray/stats/metric.h b/src/ray/stats/metric.h index dac50bc2d947..06e8534c4c67 100644 --- a/src/ray/stats/metric.h +++ b/src/ray/stats/metric.h @@ -129,9 +129,6 @@ class Metric { std::vector tag_keys_; std::unique_ptr> measure_; - // For making sure thread-safe to all of metric registrations. 
- static absl::Mutex registration_mutex_; - }; // class Metric class Gauge : public Metric { diff --git a/src/ray/stats/stats_test.cc b/src/ray/stats/stats_test.cc index 38f7952823d7..21e1627233a4 100644 --- a/src/ray/stats/stats_test.cc +++ b/src/ray/stats/stats_test.cc @@ -116,38 +116,6 @@ TEST_F(StatsTest, InitializationTest) { ASSERT_TRUE(new_first_tag.second == test_tag_value_that_shouldnt_be_applied); } -TEST(Metric, MultiThreadMetricRegisterViewTest) { - ray::stats::Shutdown(); - std::shared_ptr exporter( - new stats::StdoutExporterClient()); - ray::stats::Init({}, MetricsAgentPort, exporter); - std::vector threads; - const stats::TagKeyType tag1 = stats::TagKeyType::Register("k1"); - const stats::TagKeyType tag2 = stats::TagKeyType::Register("k2"); - for (int index = 0; index < 10; ++index) { - threads.emplace_back([tag1, tag2, index]() { - for (int i = 0; i < 100; i++) { - stats::Count random_counter( - "ray.random.counter" + std::to_string(index) + std::to_string(i), "", "", - {tag1, tag2}); - random_counter.Record(i); - stats::Gauge random_gauge( - "ray.random.gauge" + std::to_string(index) + std::to_string(i), "", "", - {tag1, tag2}); - random_gauge.Record(i); - stats::Sum random_sum( - "ray.random.sum" + std::to_string(index) + std::to_string(i), "", "", - {tag1, tag2}); - random_sum.Record(i); - } - }); - } - for (auto &thread : threads) { - thread.join(); - } - ray::stats::Shutdown(); -} - TEST_F(StatsTest, MultiThreadedInitializationTest) { // Make sure stats module is thread-safe. // Shutdown the stats module first. diff --git a/src/ray/test/run_object_manager_tests.sh b/src/ray/test/run_object_manager_tests.sh new file mode 100755 index 000000000000..ebb5eba223aa --- /dev/null +++ b/src/ray/test/run_object_manager_tests.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# This needs to be run in the root directory. + +# Cause the script to exit if a single command fails. 
+set -e +set -x + +bazel build "//:object_manager_stress_test" "//:object_manager_test" "//:plasma_store_server" + +# Get the directory in which this script is executing. +SCRIPT_DIR="$(dirname "$0")" +RAY_ROOT="$SCRIPT_DIR/../../.." +# Makes $RAY_ROOT an absolute path. +RAY_ROOT="$(cd "$RAY_ROOT" && pwd)" +if [ -z "$RAY_ROOT" ] ; then + exit 1 +fi +# Ensure we're in the right directory. +if [ ! -d "$RAY_ROOT/python" ]; then + echo "Unable to find root Ray directory. Has this script moved?" + exit 1 +fi + +REDIS_MODULE="./bazel-bin/libray_redis_module.so" +LOAD_MODULE_ARGS=(--loadmodule "${REDIS_MODULE}") +STORE_EXEC="./bazel-bin/plasma_store_server" +GCS_SERVER_EXEC="./bazel-bin/gcs_server" + +# Allow cleanup commands to fail. +bazel run //:redis-cli -- -p 6379 shutdown || true +bazel run //:redis-cli -- -p 6380 shutdown || true +sleep 1s +bazel run //:redis-server -- --loglevel warning "${LOAD_MODULE_ARGS[@]}" --port 6379 & +bazel run //:redis-server -- --loglevel warning "${LOAD_MODULE_ARGS[@]}" --port 6380 & +sleep 1s +# Run tests. +./bazel-bin/object_manager_stress_test $STORE_EXEC $GCS_SERVER_EXEC +sleep 1s +# Use timeout=1000ms for the Wait tests. +./bazel-bin/object_manager_test $STORE_EXEC 1000 $GCS_SERVER_EXEC +bazel run //:redis-cli -- -p 6379 shutdown +bazel run //:redis-cli -- -p 6380 shutdown diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index 104fff0ec317..1640c5cfc657 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -55,17 +55,6 @@ namespace ray { -RayLogLevel RayLog::severity_threshold_ = RayLogLevel::INFO; -std::string RayLog::app_name_ = ""; -std::string RayLog::log_dir_ = ""; -// Format pattern is 2020-08-21 17:00:00,000 I 100 1001 msg. -// %L is loglevel, %P is process id, %t for thread id. 
-std::string RayLog::log_format_pattern_ = "[%Y-%m-%d %H:%M:%S,%e %L %P %t] %v"; -std::string RayLog::logger_name_ = "ray_log_sink"; -long RayLog::log_rotation_max_size_ = 1 << 29; -long RayLog::log_rotation_file_num_ = 10; -bool RayLog::is_failure_signal_handler_installed_ = false; - std::string GetCallTrace() { std::string return_message = "Cannot get callstack information."; #if defined(RAY_USE_GLOG) || defined(RAY_USE_SPDLOG) @@ -102,34 +91,23 @@ inline const char *ConstBasename(const char *filepath) { return base ? (base + 1) : filepath; } -/// A logger that prints logs to stderr. -/// This is the default logger if logging is not initialized. -class DefaultStdErrLogger final { - public: - DefaultStdErrLogger() { - default_stderr_logger_ = spdlog::stderr_color_mt("stderr"); - default_stderr_logger_->set_pattern(RayLog::GetLogFormatPattern()); - } - std::shared_ptr GetDefaultLogger() { return default_stderr_logger_; } - - private: - std::shared_ptr default_stderr_logger_; -}; - -/// NOTE(lingxuan.zlx): Default stderr logger must be singleton and global -/// variable so core worker process can invoke `RAY_LOG` in its whole lifecyle. -std::unique_ptr default_stderr_logger(new DefaultStdErrLogger()); - class SpdLogMessage final { public: explicit SpdLogMessage(const char *file, int line, int loglevel) : loglevel_(loglevel) { stream() << ConstBasename(file) << ":" << line << ": "; } + inline std::shared_ptr GetDefaultLogger() { + // We just emit all log informations to stderr when no default logger has been created + // before starting ray log, which is for glog compatible. 
+ static auto logger = spdlog::stderr_color_mt("stderr"); + logger->set_pattern(RayLog::GetLogFormatPattern()); + return logger; + } inline void Flush() { auto logger = spdlog::get(RayLog::GetLoggerName()); if (!logger) { - logger = default_stderr_logger->GetDefaultLogger(); + logger = GetDefaultLogger(); } // To avoid dump duplicated stacktrace with installed failure signal // handler, we have to check whether glog failure signal handler is enabled. @@ -150,13 +128,12 @@ class SpdLogMessage final { ~SpdLogMessage() { Flush(); } inline std::ostream &stream() { return str_; } - private: - SpdLogMessage(const SpdLogMessage &) = delete; - SpdLogMessage &operator=(const SpdLogMessage &) = delete; - private: std::ostringstream str_; int loglevel_; + + SpdLogMessage(const SpdLogMessage &) = delete; + SpdLogMessage &operator=(const SpdLogMessage &) = delete; }; #endif @@ -211,6 +188,17 @@ typedef ray::SpdLogMessage LoggingProvider; typedef ray::CerrLog LoggingProvider; #endif +RayLogLevel RayLog::severity_threshold_ = RayLogLevel::INFO; +std::string RayLog::app_name_ = ""; +std::string RayLog::log_dir_ = ""; +// Format pattern is 2020-08-21 17:00:00,000 I 100 1001 msg. +// %L is loglevel, %P is process id, %t for thread id. +std::string RayLog::log_format_pattern_ = "[%Y-%m-%d %H:%M:%S,%e %L %P %t] %v"; +std::string RayLog::logger_name_ = "ray_log_sink"; +long RayLog::log_rotation_max_size_ = 1 << 29; +long RayLog::log_rotation_file_num_ = 10; +bool RayLog::is_failure_signal_handler_installed_ = false; + #ifdef RAY_USE_GLOG using namespace google; @@ -319,19 +307,11 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres #endif // Reset log pattern and level and we assume a log file can be rotated with // 10 files in max size 512M by default. - if (getenv("RAY_ROTATION_MAX_BYTES")) { - long max_size = std::atol(getenv("RAY_ROTATION_MAX_BYTES")); - // 0 means no log rotation in python, but not in spdlog. We just use the default - // value here. 
- if (max_size != 0) { - log_rotation_max_size_ = max_size; - } + if (getenv("RAY_ROTATION_MAX_SIZE")) { + log_rotation_max_size_ = std::atol(getenv("RAY_RAOTATION_MAX_SIZE")); } - if (getenv("RAY_ROTATION_BACKUP_COUNT")) { - long file_num = std::atol(getenv("RAY_ROTATION_BACKUP_COUNT")); - if (file_num != 0) { - log_rotation_file_num_ = file_num; - } + if (getenv("RAY_ROTATION_FILE_NUM")) { + log_rotation_file_num_ = std::atol(getenv("RAY_ROTATION_FILE_NUM")); } spdlog::set_pattern(log_format_pattern_); spdlog::set_level(static_cast(severity_threshold_)); diff --git a/src/ray/util/process.cc b/src/ray/util/process.cc index 0928c4402a72..a9008df32e6c 100644 --- a/src/ray/util/process.cc +++ b/src/ray/util/process.cc @@ -139,6 +139,15 @@ class ProcessFD { STARTUPINFO si = {sizeof(si)}; RAY_UNUSED( new_env_block.c_str()); // Ensure there's a final terminator for Windows + // MSDN: + // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-createprocessa + // Note that an ANSI environment block is terminated by two zero bytes: + // one for the last string, one more to terminate the block. + // A Unicode environment block is terminated by four zero bytes: + // two for the last string, two more to terminate the block. 
+ if (!new_env_block.empty()) { + new_env_block += '\0'; + } char *const envp = &new_env_block[0]; if (CreateProcessA(NULL, cmdline, NULL, NULL, FALSE, 0, envp, NULL, &si, &pi)) { succeeded = true; diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java index c12bdf87c48c..fbfc4736e031 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java @@ -11,7 +11,7 @@ public interface Function extends Serializable { * storage, and load it back when in fail-over through. {@link * Function#loadCheckpoint(Serializable)}. * - * @return A serializable object which represents function state. + *

Returns A serializable object which represents function state. */ default Serializable saveCheckpoint() { return null; diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java index d60e335a9d1e..877a93ae0e74 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java @@ -14,8 +14,8 @@ public interface FilterFunction extends Function { /** * The filter function that evaluates the predicate. * - * @param value The value to be filtered. - * @return True for values that should be retained, false for values to be filtered out. + * @param value The value to be filtered. Returns True for values that should be retained, false + * for values to be filtered out. */ boolean filter(T value) throws Exception; } diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java index 80e9d92729bf..527f469c301a 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java @@ -15,8 +15,8 @@ public interface Partition extends Function { * record. * * @param record The record. - * @param numPartition num of partitions - * @return IDs of the downstream partitions that should receive the record. + * @param numPartition num of partitions Returns IDs of the downstream partitions that should + * receive the record. 
*/ int[] partition(T record, int numPartition); } diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java index 999057d5a8b7..698eab29d2e3 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java @@ -59,8 +59,7 @@ public DataStream(PythonDataStream referencedStream) { * Apply a map function to this stream. * * @param mapFunction The map function. - * @param Type of data returned by the map function. - * @return A new DataStream. + * @param Type of data returned by the map function. Returns A new DataStream. */ public DataStream map(MapFunction mapFunction) { return new DataStream<>(this, new MapOperator<>(mapFunction)); @@ -70,8 +69,7 @@ public DataStream map(MapFunction mapFunction) { * Apply a flat-map function to this stream. * * @param flatMapFunction The FlatMapFunction - * @param Type of data returned by the flatmap function. - * @return A new DataStream + * @param Type of data returned by the flatmap function. Returns A new DataStream */ public DataStream flatMap(FlatMapFunction flatMapFunction) { return new DataStream<>(this, new FlatMapOperator<>(flatMapFunction)); @@ -86,8 +84,7 @@ public DataStream filter(FilterFunction filterFunction) { * type with each other. * * @param stream The DataStream to union output with. - * @param others The other DataStreams to union output with. - * @return A new UnionStream. + * @param others The other DataStreams to union output with. Returns A new UnionStream. */ @SafeVarargs public final DataStream union(DataStream stream, DataStream... others) { @@ -101,8 +98,7 @@ public final DataStream union(DataStream stream, DataStream... others) * Apply union transformations to this stream by merging {@link DataStream} outputs of the same * type with each other. 
* - * @param streams The DataStreams to union output with. - * @return A new UnionStream. + * @param streams The DataStreams to union output with. Returns A new UnionStream. */ public final DataStream union(List> streams) { if (this instanceof UnionStream) { @@ -119,8 +115,7 @@ public final DataStream union(List> streams) { * * @param other Another stream. * @param The type of the other stream data. - * @param The type of the data in the joined stream. - * @return A new JoinStream. + * @param The type of the data in the joined stream. Returns A new JoinStream. */ public JoinStream join(DataStream other) { return new JoinStream<>(this, other); @@ -134,8 +129,7 @@ public DataStream process() { /** * Apply a sink function and get a StreamSink. * - * @param sinkFunction The sink function. - * @return A new StreamSink. + * @param sinkFunction The sink function. Returns A new StreamSink. */ public DataStreamSink sink(SinkFunction sinkFunction) { return new DataStreamSink<>(this, new SinkOperator<>(sinkFunction)); @@ -145,8 +139,7 @@ public DataStreamSink sink(SinkFunction sinkFunction) { * Apply a key-by function to this stream. * * @param keyFunction the key function. - * @param The type of the key. - * @return A new KeyDataStream. + * @param The type of the key. Returns A new KeyDataStream. */ public KeyDataStream keyBy(KeyFunction keyFunction) { checkPartitionCall(); @@ -156,7 +149,7 @@ public KeyDataStream keyBy(KeyFunction keyFunction) { /** * Apply broadcast to this stream. * - * @return This stream. + *

Returns This stream. */ public DataStream broadcast() { checkPartitionCall(); @@ -166,8 +159,7 @@ public DataStream broadcast() { /** * Apply a partition to this stream. * - * @param partition The partitioning strategy. - * @return This stream. + * @param partition The partitioning strategy. Returns This stream. */ public DataStream partitionBy(Partition partition) { checkPartitionCall(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java index 53dd2a09738a..13de0b33bb4e 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java @@ -27,8 +27,7 @@ public static DataStreamSource fromSource( * * @param context Stream context. * @param values A collection of values. - * @param The type of source data. - * @return A DataStreamSource. + * @param The type of source data. Returns A DataStreamSource. */ public static DataStreamSource fromCollection( StreamingContext context, Collection values) { diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java index c50b232697e4..fb6431ef2da8 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java @@ -33,8 +33,7 @@ public KeyDataStream(PythonDataStream referencedStream) { /** * Apply a reduce function to this stream. * - * @param reduceFunction The reduce function. - * @return A new DataStream. + * @param reduceFunction The reduce function. Returns A new DataStream. 
*/ public DataStream reduce(ReduceFunction reduceFunction) { return new DataStream<>(this, new ReduceOperator(reduceFunction)); @@ -45,8 +44,7 @@ public DataStream reduce(ReduceFunction reduceFunction) { * * @param aggregateFunction The aggregate function * @param The type of aggregated intermediate data. - * @param The type of result data. - * @return A new DataStream. + * @param The type of result data. Returns A new DataStream. */ public DataStream aggregate(AggregateFunction aggregateFunction) { return new DataStream<>(this, null); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java index b192dbcc8a18..6e40ee441c32 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java @@ -43,7 +43,7 @@ public JobGraph( * Generate direct-graph(made up of a set of vertices and connected by edges) by current job graph * for simple log printing. * - * @return Digraph in string type. + *

Returns Digraph in string type. */ public String generateDigraph() { StringBuilder digraph = new StringBuilder(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java index 90f018ecdc89..25b5873105a6 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java @@ -51,8 +51,7 @@ public PythonDataStream map(String moduleName, String funcName) { /** * Apply a map function to this stream. * - * @param func The python MapFunction. - * @return A new PythonDataStream. + * @param func The python MapFunction. Returns A new PythonDataStream. */ public PythonDataStream map(PythonFunction func) { func.setFunctionInterface(FunctionInterface.MAP_FUNCTION); @@ -66,8 +65,7 @@ public PythonDataStream flatMap(String moduleName, String funcName) { /** * Apply a flat-map function to this stream. * - * @param func The python FlapMapFunction. - * @return A new PythonDataStream + * @param func The python FlapMapFunction. Returns A new PythonDataStream */ public PythonDataStream flatMap(PythonFunction func) { func.setFunctionInterface(FunctionInterface.FLAT_MAP_FUNCTION); @@ -81,9 +79,8 @@ public PythonDataStream filter(String moduleName, String funcName) { /** * Apply a filter function to this stream. * - * @param func The python FilterFunction. - * @return A new PythonDataStream that contains only the elements satisfying the given filter - * predicate. + * @param func The python FilterFunction. Returns A new PythonDataStream that contains only the + * elements satisfying the given filter predicate. 
*/ public PythonDataStream filter(PythonFunction func) { func.setFunctionInterface(FunctionInterface.FILTER_FUNCTION); @@ -95,8 +92,7 @@ public PythonDataStream filter(PythonFunction func) { * same type with each other. * * @param stream The DataStream to union output with. - * @param others The other DataStreams to union output with. - * @return A new UnionStream. + * @param others The other DataStreams to union output with. Returns A new UnionStream. */ public final PythonDataStream union(PythonDataStream stream, PythonDataStream... others) { List streams = new ArrayList<>(); @@ -109,8 +105,7 @@ public final PythonDataStream union(PythonDataStream stream, PythonDataStream... * Apply union transformations to this stream by merging {@link PythonDataStream} outputs of the * same type with each other. * - * @param streams The DataStreams to union output with. - * @return A new UnionStream. + * @param streams The DataStreams to union output with. Returns A new UnionStream. */ public final PythonDataStream union(List streams) { if (this instanceof PythonUnionStream) { @@ -129,8 +124,7 @@ public PythonStreamSink sink(String moduleName, String funcName) { /** * Apply a sink function and get a StreamSink. * - * @param func The python SinkFunction. - * @return A new StreamSink. + * @param func The python SinkFunction. Returns A new StreamSink. */ public PythonStreamSink sink(PythonFunction func) { func.setFunctionInterface(FunctionInterface.SINK_FUNCTION); @@ -144,8 +138,7 @@ public PythonKeyDataStream keyBy(String moduleName, String funcName) { /** * Apply a key-by function to this stream. * - * @param func the python keyFunction. - * @return A new KeyDataStream. + * @param func the python keyFunction. Returns A new KeyDataStream. */ public PythonKeyDataStream keyBy(PythonFunction func) { checkPartitionCall(); @@ -156,7 +149,7 @@ public PythonKeyDataStream keyBy(PythonFunction func) { /** * Apply broadcast to this stream. * - * @return This stream. + *

Returns This stream. */ public PythonDataStream broadcast() { checkPartitionCall(); @@ -166,8 +159,7 @@ public PythonDataStream broadcast() { /** * Apply a partition to this stream. * - * @param partition The partitioning strategy. - * @return This stream. + * @param partition The partitioning strategy. Returns This stream. */ public PythonDataStream partitionBy(PythonPartition partition) { checkPartitionCall(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java index 078f84ac4a94..8116fd392923 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java @@ -31,8 +31,7 @@ public PythonDataStream reduce(String moduleName, String funcName) { /** * Apply a reduce function to this stream. * - * @param func The reduce function. - * @return A new DataStream. + * @param func The reduce function. Returns A new DataStream. */ public PythonDataStream reduce(PythonFunction func) { func.setFunctionInterface(FunctionInterface.REDUCE_FUNCTION); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java index 2ec3b6dfb944..0c555e7c5ada 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java @@ -11,7 +11,7 @@ public interface CommonConfig extends Config { /** * Ray streaming job id. Non-custom. * - * @return Job id with string type. + *

Returns Job id with string type. */ @DefaultValue(value = "default-job-id") @Key(value = JOB_ID) @@ -20,7 +20,7 @@ public interface CommonConfig extends Config { /** * Ray streaming job name. Non-custom. * - * @return Job name with string type. + *

Returns Job name with string type. */ @DefaultValue(value = "default-job-name") @Key(value = JOB_NAME) diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java index 79189431a2ba..bc2fc2bd3662 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java @@ -11,7 +11,7 @@ public interface SchedulerConfig extends Config { /** * The timeout ms of worker initiation. Default is: 10000ms(10s). * - * @return timeout ms + *

Returns timeout ms */ @Key(WORKER_INITIATION_WAIT_TIMEOUT_MS) @DefaultValue(value = "10000") @@ -20,7 +20,7 @@ public interface SchedulerConfig extends Config { /** * The timeout ms of worker starting. Default is: 10000ms(10s). * - * @return timeout ms + *

Returns timeout ms */ @Key(WORKER_STARTING_WAIT_TIMEOUT_MS) @DefaultValue(value = "10000") diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java index 83b62696e6ba..faf8703905be 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java @@ -12,15 +12,14 @@ public interface ContextBackend { /** * check if key exists in state * - * @return true if exists + *

Returns true if exists */ boolean exists(final String key) throws Exception; /** * get content by key * - * @param key key - * @return the StateBackend + * @param key key Returns the StateBackend */ byte[] get(final String key) throws Exception; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java index 2852e0f99141..b0d3b522ed10 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java @@ -156,7 +156,7 @@ public AtomicInteger getExecutionVertexIdGenerator() { /** * Get all execution vertices from current execution graph. * - * @return all execution vertices. + *

Returns all execution vertices. */ public List getAllExecutionVertices() { return executionJobVertexMap.values().stream() @@ -168,7 +168,7 @@ public List getAllExecutionVertices() { /** * Get all execution vertices whose status is 'TO_ADD' from current execution graph. * - * @return all added execution vertices. + *

Returns all added execution vertices. */ public List getAllAddedExecutionVertices() { return executionJobVertexMap.values().stream() @@ -181,8 +181,7 @@ public List getAllAddedExecutionVertices() { /** * Get specified execution vertex from current execution graph by execution vertex id. * - * @param executionVertexId execution vertex id. - * @return the specified execution vertex. + * @param executionVertexId execution vertex id. Returns the specified execution vertex. */ public ExecutionVertex getExecutionVertexByExecutionVertexId(int executionVertexId) { if (executionVertexMap.containsKey(executionVertexId)) { @@ -194,8 +193,7 @@ public ExecutionVertex getExecutionVertexByExecutionVertexId(int executionVertex /** * Get specified execution vertex from current execution graph by actor id. * - * @param actorId the actor id of execution vertex. - * @return the specified execution vertex. + * @param actorId the actor id of execution vertex. Returns the specified execution vertex. */ public ExecutionVertex getExecutionVertexByActorId(ActorId actorId) { return actorIdExecutionVertexMap.get(actorId); @@ -204,8 +202,7 @@ public ExecutionVertex getExecutionVertexByActorId(ActorId actorId) { /** * Get specified actor by actor id. * - * @param actorId the actor id of execution vertex. - * @return the specified actor handle. + * @param actorId the actor id of execution vertex. Returns the specified actor handle. 
*/ public Optional getActorById(ActorId actorId) { return getAllActors().stream().filter(actor -> actor.getId().equals(actorId)).findFirst(); @@ -215,8 +212,7 @@ public Optional getActorById(ActorId actorId) { * Get the peer actor in the other side of channelName of a given actor * * @param actor actor in this side - * @param channelName the channel name - * @return the peer actor in the other side + * @param channelName the channel name Returns the peer actor in the other side */ public BaseActorHandle getPeerActor(BaseActorHandle actor, String channelName) { Set set = getActorsByChannelId(channelName); @@ -233,8 +229,7 @@ public BaseActorHandle getPeerActor(BaseActorHandle actor, String channelName) { /** * Get actors in both sides of a channelId * - * @param channelId the channelId - * @return actors in both sides + * @param channelId the channelId Returns actors in both sides */ public Set getActorsByChannelId(String channelId) { return channelGroupedActors.getOrDefault(channelId, Sets.newHashSet()); @@ -243,7 +238,7 @@ public Set getActorsByChannelId(String channelId) { /** * Get all actors by graph. * - * @return actor list + *

Returns actor list */ public List getAllActors() { return getActorsFromJobVertices(getExecutionJobVertexList()); @@ -252,7 +247,7 @@ public List getAllActors() { /** * Get source actors by graph. * - * @return actor list + *

Returns actor list */ public List getSourceActors() { List executionJobVertices = @@ -266,7 +261,7 @@ public List getSourceActors() { /** * Get transformation and sink actors by graph. * - * @return actor list + *

Returns actor list */ public List getNonSourceActors() { List executionJobVertices = @@ -283,7 +278,7 @@ public List getNonSourceActors() { /** * Get sink actors by graph. * - * @return actor list + *

Returns actor list */ public List getSinkActors() { List executionJobVertices = @@ -297,8 +292,7 @@ public List getSinkActors() { /** * Get actors according to job vertices. * - * @param executionJobVertices specified job vertices - * @return actor list + * @param executionJobVertices specified job vertices Returns actor list */ public List getActorsFromJobVertices( List executionJobVertices) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java index cf869c0c4f2a..0aa426672db6 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java @@ -109,7 +109,7 @@ public String getExecutionJobVertexName() { /** * e.g. 1-SourceOperator * - * @return operator name with index + *

Returns operator name with index */ public String getExecutionJobVertexNameWithIndex() { return executionJobVertexId + "-" + executionJobVertexName; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java index 9b07d131f7c9..b0dec4aef0c0 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java @@ -24,7 +24,7 @@ public Resources() {} /** * Get registered containers, the container list is read-only. * - * @return container list. + *

Returns container list. */ public ImmutableList getRegisteredContainers() { return ImmutableList.copyOf(registerContainers); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java index fd672978a4f2..a1dd5b6bc14b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java @@ -101,7 +101,7 @@ private void loadMasterCheckpoint() { /** * Init JobMaster. To initiate or recover other components(like metrics and extra coordinators). * - * @return init result + *

Returns init result */ public Boolean init(boolean isRecover) { LOG.info("Initializing job master, isRecover={}.", isRecover); @@ -136,8 +136,7 @@ public Boolean init(boolean isRecover) { * * * @param jobMasterActor JobMaster actor - * @param jobGraph logical plan - * @return submit result + * @param jobGraph logical plan Returns submit result */ public boolean submitJob(ActorHandle jobMasterActor, JobGraph jobGraph) { LOG.info("Begin submitting job using logical plan: {}.", jobGraph); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java index b563917d97b4..ce8dd474157a 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java @@ -19,22 +19,21 @@ public interface GraphManager { /** * Build execution graph from job graph. * - * @param jobGraph logical plan of streaming job. - * @return physical plan of streaming job. + * @param jobGraph logical plan of streaming job. Returns physical plan of streaming job. */ ExecutionGraph buildExecutionGraph(JobGraph jobGraph); /** * Get job graph. * - * @return the job graph. + *

Returns the job graph. */ JobGraph getJobGraph(); /** * Get execution graph. * - * @return the execution graph. + *

Returns the execution graph. */ ExecutionGraph getExecutionGraph(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java index fbe3f696aa59..43671eea1b28 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java @@ -10,7 +10,7 @@ public interface ResourceManager extends ResourceAssignStrategy { /** * Get registered containers, the container list is read-only. * - * @return the registered container list + *

Returns the registered container list */ ImmutableList getRegisteredContainers(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java index 9ce131d2599c..8df20790cb90 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java @@ -13,8 +13,7 @@ public interface ResourceAssignStrategy { * Assign {@link Container} for {@link ExecutionVertex} * * @param containers registered container - * @param executionGraph execution graph - * @return allocating view + * @param executionGraph execution graph Returns allocating view */ ResourceAssignmentView assignResource(List containers, ExecutionGraph executionGraph); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java index 48f2366cd37d..74b646c67364 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java @@ -42,8 +42,8 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy { * Assign resource to each execution vertex in the given execution graph. 
* * @param containers registered containers - * @param executionGraph execution graph - * @return allocating map, key is container ID, value is list of vertextId, and contains vertices + * @param executionGraph execution graph Returns allocating map, key is container ID, value is + * list of vertextId, and contains vertices */ @Override public ResourceAssignmentView assignResource( @@ -133,8 +133,7 @@ private void updateContainerCapacity(List containers, int capacity) { * Find a container which matches required resource * * @param requiredResource required resource - * @param containers registered containers - * @return container that matches the required resource + * @param containers registered containers Returns container that matches the required resource */ private Container findMatchedContainer( Map requiredResource, List containers) { @@ -160,8 +159,7 @@ private Container findMatchedContainer( * Check if current container has enough resource * * @param requiredResource required resource - * @param container container - * @return true if matches, false else + * @param container container Returns true if matches, false else */ private boolean hasEnoughResource(Map requiredResource, Container container) { LOG.info("Check resource for index: {}, container: {}", currentContainerIndex, container); @@ -202,8 +200,7 @@ private boolean hasEnoughResource(Map requiredResource, Containe /** * Forward to next container * - * @param containers registered container list - * @return next container in the list + * @param containers registered container list Returns next container in the list */ private Container forwardToNextContainer(List containers) { this.currentContainerIndex = (this.currentContainerIndex + 1) % containers.size(); @@ -213,8 +210,7 @@ private Container forwardToNextContainer(List containers) { /** * Get current container * - * @param containers registered container - * @return current container to allocate actor + * @param containers registered container 
Returns current container to allocate actor */ private Container getCurrentContainer(List containers) { return containers.get(currentContainerIndex); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java index d0fb60d54878..962c0bdfa92b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java @@ -8,8 +8,7 @@ public interface JobScheduler { /** * Schedule streaming job using the physical plan. * - * @param executionGraph physical plan - * @return scheduling result + * @param executionGraph physical plan Returns scheduling result */ boolean scheduleJob(ExecutionGraph executionGraph); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java index 039715ccbefd..6309bb334e32 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java @@ -95,8 +95,7 @@ private void initAndStart(ExecutionGraph executionGraph) { /** * Create JobWorker actors according to the physical plan. * - * @param executionGraph physical plan - * @return actor creation result + * @param executionGraph physical plan Returns actor creation result */ public boolean createWorkers(ExecutionGraph executionGraph) { LOG.info("Begin creating workers."); @@ -149,8 +148,7 @@ public boolean startWorkers(ExecutionGraph executionGraph, long checkpointId) { /** * Build workers context. 
* - * @param executionGraph execution graph - * @return vertex to worker context map + * @param executionGraph execution graph Returns vertex to worker context map */ protected Map buildWorkersContext( ExecutionGraph executionGraph) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java index 3cd3984b2043..f5c4be5f7ee1 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java @@ -36,8 +36,7 @@ public boolean createWorkers(List executionVertices) { /** * Create JobWorker actor according to the execution vertex. * - * @param executionVertex target execution vertex - * @return creation result + * @param executionVertex target execution vertex Returns creation result */ private boolean createWorker(ExecutionVertex executionVertex) { LOG.info( @@ -85,8 +84,7 @@ private boolean createWorker(ExecutionVertex executionVertex) { * Using context to init JobWorker. * * @param vertexToContextMap target JobWorker actor - * @param timeout timeout for waiting, unit: ms - * @return initiation result + * @param timeout timeout for waiting, unit: ms Returns initiation result */ public boolean initWorkers( Map vertexToContextMap, int timeout) { @@ -122,8 +120,7 @@ public boolean initWorkers( * Start JobWorkers to run task. 
* * @param executionGraph physical plan - * @param timeout timeout for waiting, unit: ms - * @return starting result + * @param timeout timeout for waiting, unit: ms Returns starting result */ public boolean startWorkers(ExecutionGraph executionGraph, long lastCheckpointId, int timeout) { LOG.info("Begin starting workers."); @@ -153,8 +150,7 @@ public boolean startWorkers(ExecutionGraph executionGraph, long lastCheckpointId /** * Stop and destroy JobWorkers' actor. * - * @param executionVertices target vertices - * @return destroy result + * @param executionVertices target vertices Returns destroy result */ public boolean destroyWorkers(List executionVertices) { return asyncBatchExecute(this::destroyWorker, executionVertices); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java index 6cd788138883..5a5475350d65 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java @@ -25,8 +25,7 @@ public class RemoteCallWorker { * Call JobWorker actor to init. * * @param actor target JobWorker actor - * @param context JobWorker's context - * @return init result + * @param context JobWorker's context Returns init result */ public static ObjectRef initWorker(BaseActorHandle actor, JobWorkerContext context) { LOG.info("Call worker to initiate, actor: {}, context: {}.", actor.getId(), context); @@ -51,8 +50,7 @@ public static ObjectRef initWorker(BaseActorHandle actor, JobWorkerCont * Call JobWorker actor to start. 
* * @param actor target JobWorker actor - * @param checkpointId checkpoint ID to be rollback - * @return start result + * @param checkpointId checkpoint ID to be rollback Returns start result */ public static ObjectRef rollback(BaseActorHandle actor, final Long checkpointId) { LOG.info("Call worker to start, actor: {}.", actor.getId()); @@ -81,8 +79,7 @@ public static ObjectRef rollback(BaseActorHandle actor, final Long checkpointId) /** * Call JobWorker actor to destroy without reconstruction. * - * @param actor target JobWorker actor - * @return destroy result + * @param actor target JobWorker actor Returns destroy result */ public static Boolean shutdownWithoutReconstruction(BaseActorHandle actor) { LOG.info("Call worker to shutdown without reconstruction, actor is {}.", actor.getId()); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java index ff3c62fee11c..17ab4fe1ec4a 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java @@ -115,8 +115,7 @@ private static native long createDataReaderNative( /** * Read message from input channels, if timeout, return null. 
* - * @param timeoutMillis timeout - * @return message or null + * @param timeoutMillis timeout Returns message or null */ public ChannelMessage read(long timeoutMillis) { if (buf.isEmpty()) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java index 731031d62a9b..d3a4b8d71773 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java @@ -86,8 +86,7 @@ public static String genRandomIdStr() { * Generate channel name, which will be {@link ChannelId#ID_LENGTH} character * * @param fromTaskId upstream task id - * @param toTaskId downstream task id - * @return channel name + * @param toTaskId downstream task id Returns channel name */ public static String genIdStr(int fromTaskId, int toTaskId, long ts) { /* @@ -117,8 +116,7 @@ public static String genIdStr(int fromTaskId, int toTaskId, long ts) { } /** - * @param id hex string representation of channel id - * @return bytes representation of channel id + * @param id hex string representation of channel id Returns bytes representation of channel id */ public static byte[] idStrToBytes(String id) { byte[] idBytes = BaseEncoding.base16().decode(id.toUpperCase()); @@ -127,8 +125,7 @@ public static byte[] idStrToBytes(String id) { } /** - * @param id bytes representation of channel id - * @return hex string representation of channel id + * @param id bytes representation of channel id Returns hex string representation of channel id */ public static String idBytesToStr(byte[] id) { assert id.length == ChannelId.ID_LENGTH; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java 
b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java index 29ac29f4d51e..07fda18a6c5a 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java @@ -36,7 +36,7 @@ public static void loadNativeLibraries() { /** * Execute an external command. * - * @return Whether the command succeeded. + *

Returns Whether the command succeeded. */ public static boolean executeCommand(List command, int waitTimeoutSeconds) { try { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java index 324e1ab9dcd9..effafcc540a0 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java @@ -77,10 +77,7 @@ public static void wrapDirectBuffer(ByteBuffer buffer, long address, int size) { buffer.clear(); } - /** - * @param buffer a DirectBuffer backed by off-heap memory - * @return address of off-heap memory - */ + /** @param buffer a DirectBuffer backed by off-heap memory Returns address of off-heap memory */ public static long getAddress(ByteBuffer buffer) { return ((DirectBuffer) buffer).address(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java index b3243d69f449..a97a2f5bab3b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java @@ -15,7 +15,7 @@ public class RayUtils { /** * Get all node info from GCS * - * @return node info list + *

Returns node info list */ public static List getAllNodeInfo() { if (Ray.getRuntimeContext().isSingleProcess()) { @@ -28,7 +28,7 @@ public static List getAllNodeInfo() { /** * Get all alive node info map * - * @return node info map, key is unique node id , value is node info + *

Returns node info map, key is unique node id , value is node info */ public static Map getAliveNodeInfoMap() { return getAllNodeInfo().stream() diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java index 13a75f8ebc7b..bc04a1ded0f6 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java @@ -20,7 +20,7 @@ public static Method findMethod(Class cls, String methodName) { /** * For covariant return type, return the most specific method. * - * @return all methods named by {@code methodName}, + *

Returns all methods named by {@code methodName}, */ public static List findMethods(Class cls, String methodName) { List> classes = new ArrayList<>(); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java index b00b6ee96b85..b8336cd145be 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java @@ -52,8 +52,8 @@ public static void logProcessMemoryDetail() { } /** - * @return jvm heap usage ratio. note that one of the survivor space is not include in total - * memory while calculating this ratio. + * Returns jvm heap usage ratio. note that one of the survivor space is not include in total + * memory while calculating this ratio. */ public static double getJvmHeapUsageRatio() { Runtime runtime = Runtime.getRuntime(); @@ -61,8 +61,8 @@ public static double getJvmHeapUsageRatio() { } /** - * @return jvm heap usage(in bytes). note that this value doesn't include one of the survivor - * space. + * Returns jvm heap usage(in bytes). note that this value doesn't include one of the survivor + * space. */ public static long getJvmHeapUsageInBytes() { Runtime runtime = Runtime.getRuntime(); @@ -95,8 +95,8 @@ public static double getProcessCpuUsage() { } /** - * @return the system cpu usage. This value is a double in the [0.0,1.0] We will try to use `vsar` - * to get cpu usage by default, and use MXBean if any exception raised. + * Returns the system cpu usage. This value is a double in the [0.0,1.0] We will try to use `vsar` + * to get cpu usage by default, and use MXBean if any exception raised. 
*/ public static double getSystemCpuUsage() { double cpuUsage = 0.0; @@ -109,10 +109,10 @@ public static double getSystemCpuUsage() { } /** - * @return the "recent cpu usage" for the whole system. This value is a double in the [0.0,1.0] - * interval. A value of 0.0 means that all CPUs were idle during the recent period of time - * observed, while a value of 1.0 means that all CPUs were actively running 100% of the time - * during the recent period being observed + * Returns the "recent cpu usage" for the whole system. This value is a double in the [0.0,1.0] + * interval. A value of 0.0 means that all CPUs were idle during the recent period of time + * observed, while a value of 1.0 means that all CPUs were actively running 100% of the time + * during the recent period being observed */ public static double getSystemCpuUtilByMXBean() { return osmxb.getSystemCpuLoad(); @@ -144,7 +144,7 @@ public static double getSystemCpuUtilByVsar() throws Exception { return cpuUsageFromVsar; } - /** Returns the system load average for the last minute */ + /** Returnss the system load average for the last minute */ public static double getSystemLoadAverage() { return osmxb.getSystemLoadAverage(); } @@ -158,8 +158,7 @@ public static int getCpuCores() { * Get containers by hostname of address * * @param containers container list - * @param containerHosts container hostname or address set - * @return matched containers + * @param containerHosts container hostname or address set Returns matched containers */ public static List getContainersByHostname( List containers, Collection containerHosts) { @@ -175,8 +174,7 @@ public static List getContainersByHostname( /** * Get container by hostname * - * @param hostName container hostname - * @return container + * @param hostName container hostname Returns container */ public static Optional getContainerByHostname( List containers, String hostName) { @@ -190,8 +188,7 @@ public static Optional getContainerByHostname( /** * Get container by id * 
- * @param containerID container id - * @return container + * @param containerID container id Returns container */ public static Optional getContainerById( List containers, ContainerId containerID) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java index 15200c65633e..5a6554802bc3 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java @@ -137,8 +137,8 @@ public Boolean init(JobWorkerContext workerContext) { /** * Start worker's stream tasks with specific checkpoint ID. * - * @return a {@link CallResult} with {@link ChannelRecoverInfo}, contains {@link - * ChannelCreationStatus} of each input queue. + *

Returns a {@link CallResult} with {@link ChannelRecoverInfo}, contains {@link + * ChannelCreationStatus} of each input queue. */ public CallResult rollback(Long checkpointId, Long startRollbackTs) { synchronized (initialStateChangeLock) { diff --git a/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java b/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java index eb48f1691a12..5fe774e20b22 100644 --- a/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java +++ b/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java @@ -49,8 +49,8 @@ public static List mockGetAllNodeInfo() { /** * Mock get node info map * - * @param nodeInfos all node infos fetched from GCS - * @return node info map, key is node unique id, value is node info + * @param nodeInfos all node infos fetched from GCS Returns node info map, key is node unique id, + * value is node info */ public static Map mockGetNodeInfoMap(List nodeInfos) { return nodeInfos.stream() diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java index 921ea8598b43..10f99c0b6b2f 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java @@ -50,8 +50,8 @@ public static KeyGroup getKeyGroup(int maxParallelism, int parallelism, int inde * Assigning the key to a key-group index. * * @param key the key to assign. - * @param maxParallelism the maximum parallelism. - * @return the key-group index to which the given key is assigned. + * @param maxParallelism the maximum parallelism. Returns the key-group index to which the given + * key is assigned. 
*/ public static int assignKeyGroupIndexForKey(Object key, int maxParallelism) { return Math.abs(key.hashCode() % maxParallelism); diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java index a632d21d0728..933081af5383 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java @@ -28,8 +28,7 @@ public interface MapState extends UnaryState> { /** * Returns the current value associated with the given key. * - * @param key The key of the mapping - * @return The value of the mapping with the given key + * @param key The key of the mapping Returns The value of the mapping with the given key */ V get(K key); @@ -65,8 +64,8 @@ public interface MapState extends UnaryState> { /** * Returns whether there exists the given mapping. * - * @param key The key of the mapping - * @return True if there exists a mapping whose key equals to the given key + * @param key The key of the mapping Returns True if there exists a mapping whose key equals to + * the given key */ default boolean contains(K key) { return get().containsKey(key); @@ -75,7 +74,7 @@ default boolean contains(K key) { /** * Returns all the mappings in the state * - * @return An iterable view of all the key-value pairs in the state. + *

Returns An iterable view of all the key-value pairs in the state. */ default Iterable> entries() { return get().entrySet(); @@ -84,7 +83,7 @@ default Iterable> entries() { /** * Returns all the keys in the state * - * @return An iterable view of all the keys in the state. + *

Returns An iterable view of all the keys in the state. */ default Iterable keys() { return get().keySet(); @@ -93,7 +92,7 @@ default Iterable keys() { /** * Returns all the values in the state. * - * @return An iterable view of all the values in the state. + *

Returns An iterable view of all the values in the state. */ default Iterable values() { return get().values(); @@ -102,7 +101,7 @@ default Iterable values() { /** * Iterates over all the mappings in the state. * - * @return An iterator over all the mappings in the state + *

Returns An iterator over all the mappings in the state */ default Iterator> iterator() { return get().entrySet().iterator(); diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java index 637b573144b8..5c250b594973 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java @@ -24,7 +24,7 @@ public interface UnaryState extends State { /** * get the value in state * - * @return the value in state + *

Returns the value in state */ O get(); }