Merged
Changes from all commits
47 commits
4026b38
[Java] Remove RayRuntimeInternal class (#25016)
jovany-wang May 24, 2022
873c44d
[Docs] Add "Examples" block to Ray Data landing page, and consistentl…
May 24, 2022
da5cf93
Create `.git-blame-ignore-revs` for black formatting (#25118)
bveeramani May 24, 2022
8703d5e
[air preprocessor] Add limit to OHE. (#24893)
xwjiang2010 May 24, 2022
b05b28b
HandlePinObjectIds should return error if the object doesn't exist (#…
jjyao May 24, 2022
806c187
[autoscaler] Flush stdout and stdin when running commands. (#19473)
DmitriGekhtman May 24, 2022
5b9b4fa
Ignore previous tasks before submitting ones via `map` and `map_unord…
czgdp1807 May 24, 2022
4e99a57
[RLlib] Add `@OverrideToImplementCustomLogic` decorators to some `Tra…
sven1977 May 24, 2022
e7e75b4
[tune] rolling back wandb service. replacing deprecated wandb methods…
captain-pool May 24, 2022
e73c37c
[RLlib] MADDPG: Move into main `algorithms` folder and add proper uni…
sven1977 May 24, 2022
a7e7593
[State Observability API] Error handling (#24413)
rkooo567 May 24, 2022
804b6b1
Revert "[Java] Remove RayRuntimeInternal class (#25016)" (#25139)
krfricke May 24, 2022
7013b32
[Release] prefer last cluster env version in release tests (#24950)
mwtian May 24, 2022
d76ef9a
[RLLib] Fix RNNSAC example failing on CI + fixes for recurrent models…
ArturNiederfahrenhorst May 24, 2022
65d21b7
[job submission] Handle `env_vars: None` case properly in supervisor …
edoakes May 24, 2022
6a4b361
[ludwig] Upgrade jsonschema for ludwig tests (#25155)
krfricke May 24, 2022
9e3c88d
[GCP] Update TPU Region (#25123)
JiahaoYao May 24, 2022
81c0b24
[tune/docs] fix typo (#25109)
Nintorac May 24, 2022
603cba6
Use OBOD report as the source of truth of in-memory locations of obje…
jjyao May 24, 2022
aaee8f0
[tune/train] Consolidate checkpoint manager 1: Common checkpoint mana…
krfricke May 24, 2022
b825a83
Mark dataset_shuffle_push_based_sort_1tb as stable (#25162)
jjyao May 24, 2022
8b34513
[Serve] Update Serve status formatting and processing (#24839)
shrekris-anyscale May 24, 2022
f27e85c
[Serve][Deployment Graph][Perf] Add minimal executor DAGNode (#24754)
jiaodong May 24, 2022
93ff0be
[RLlib] Introduce utils to serialize gym Spaces (and thus ViewRequire…
May 24, 2022
00cdd8d
Add chaos test for dataset shuffle (#25161)
jjyao May 24, 2022
323605d
Support file:// for runtime_env working directories in jobs (#25062)
pcmoritz May 24, 2022
f79b826
[Dashboard] avoid showing disk info when it is unavailable (#24992)
mwtian May 25, 2022
51dbd99
[air] Minor Doc updates (#25097)
xwjiang2010 May 25, 2022
f032849
[AIR] Don't ravel predictions in `TensorflowPrediction.predict` (#25138)
bveeramani May 25, 2022
c876538
[datasets] Fix scheduling strategy propagation and stats collection i…
stephanie-wang May 25, 2022
f7692e4
[core] Remove more expensive shuffle tests (#25165)
stephanie-wang May 25, 2022
fa32cb7
Revert "[core] Resubscribe GCS in python when GCS restarts. (#24887)"…
mwtian May 25, 2022
f67871c
[workflow] Fast workflow indexing (#24767)
suquark May 25, 2022
4963dfa
[api] Add API stability annotations for all RLlib symbols and add to …
ericl May 25, 2022
14a05ee
Add documentation issue template (#25116)
bveeramani May 25, 2022
edca963
[RLlib] Curiosity Bug Fix. (#24880)
mavroudisv May 25, 2022
af895b3
[gcs] Fix detached actor fail to restart when GCS restarted. (#25131)
fishbone May 25, 2022
eaf9c94
[RLlib] Migrate PPO Impala and APPO policies to use sub-classing impl…
May 25, 2022
67cd984
[tune] Add annotations/set scope for Tune classes (#25077)
krfricke May 25, 2022
833e357
[air] Do not use gzip for checkpoint dict conversion (#25177)
krfricke May 25, 2022
2b6c630
[CI] Fix typo in CI label (#25185)
Yard1 May 25, 2022
eaed256
[RLlib] Async parallel execution manager. (#24423)
avnishn May 25, 2022
be93bb3
Remove MLDataset (#25041)
jjyao May 25, 2022
a8fc0c5
Add landing & key concepts pages for clusters (#24379)
jvrdnd May 25, 2022
5cee613
[core] Fix bugs in data locality (#24698) (#25092)
stephanie-wang May 25, 2022
e407953
[AIR] Change key to model in TensorflowTrainer (#25183)
kvaithin May 25, 2022
ac620ae
[build] Add tools to generate `compile_commands.json` (#25180)
lixin-wei May 25, 2022
2 changes: 1 addition & 1 deletion .buildkite/pipeline.ml.yml
@@ -377,7 +377,7 @@
- DOC_TESTING=1 PYTHON=3.7 ./ci/env/install-dependencies.sh
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-ray_air,-gpu,-py37,-post_wheel_build doc/...

- label: ":book: :ariplane: Ray AIR examples"
- label: ":book: :airplane: Ray AIR examples"
conditions:
["RAY_CI_PYTHON_AFFECTED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_DOC_AFFECTED", "RAY_CI_SERVE_AFFECTED"]
commands:
2 changes: 2 additions & 0 deletions .git-blame-ignore-revs
@@ -0,0 +1,2 @@
# Formatted Python code with Black
7f1bacc7dc9caf6d0ec042e39499bbf1d9a7d065
26 changes: 26 additions & 0 deletions .github/ISSUE_TEMPLATE/documentation-issue.yml
@@ -0,0 +1,26 @@
name: Documentation
title: "[<Ray component: Core|RLlib|etc...>] "
description: Report an issue with the Ray documentation
labels: [docs]
body:
- type: markdown
attributes:
value: Thank you for helping us improve the Ray documentation!

- type: textarea
attributes:
label: Description
description: |
Tell us about the change you'd like to see. For example, "I'd like to
see more examples of how to use `ray.remote`."
validations:
required: true

- type: textarea
attributes:
label: Link
description: |
If the problem is related to an existing section, please add a link to
the section. For example, https://docs.ray.io/en/master/ray-core/package-ref.html#ray.remote.
validations:
required: false
8 changes: 8 additions & 0 deletions .gitignore
@@ -213,3 +213,11 @@ workflow_data/

# Jupyter Notebooks
**/.ipynb_checkpoints/

### Added by Hedron's Bazel Compile Commands Extractor: https://github.com/hedronvision/bazel-compile-commands-extractor
# The external link: Differs on Windows vs macOS/Linux, so we can't check it in. The pattern needs to not have a trailing / because it's a symlink on macOS/Linux.
/external
# Compiled output -> don't check in
/compile_commands.json
# Directory where clangd puts its indexing work
/.cache/
6 changes: 6 additions & 0 deletions WORKSPACE
@@ -20,3 +20,9 @@ load("@bazel_skylib//lib:versions.bzl", "versions")
# When the bazel version is updated, make sure to update it
# in setup.py as well.
versions.check(minimum_bazel_version = "4.2.1")

# Tools to generate `compile_commands.json` to enable awesome tooling of the C language family.
# Just run `bazel run @hedron_compile_commands//:refresh_all`
load("@hedron_compile_commands//:workspace_setup.bzl", "hedron_compile_commands_setup")

hedron_compile_commands_setup()
13 changes: 13 additions & 0 deletions bazel/ray_deps_setup.bzl
@@ -306,3 +306,16 @@ def ray_deps_setup():
],
sha256 = "379113459b0feaf6bfbb584a91874c065078aa673222846ac765f86661c27407",
)

# Hedron's Compile Commands Extractor for Bazel
# https://github.com/hedronvision/bazel-compile-commands-extractor
http_archive(
name = "hedron_compile_commands",

# Replace the commit hash in both places (below) with the latest, rather than using the stale one here.
# Even better, set up Renovate and let it do the work for you (see "Suggestion: Updates" in the README).
url = "https://github.com/hedronvision/bazel-compile-commands-extractor/archive/cfd16a16cb4c4f27337ef652aa8510dcf1dd01ce.tar.gz",
strip_prefix = "bazel-compile-commands-extractor-cfd16a16cb4c4f27337ef652aa8510dcf1dd01ce",
# When you first run this tool, it'll recommend a sha256 hash to put here with a message like: "DEBUG: Rule 'hedron_compile_commands' indicated that a canonical reproducible form can be obtained by modifying arguments sha256 = ..."
sha256 = "4c2753a8d446f561391b7968a6d0eed748e8bb0f40adeda51301c57e829c7696",
)
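Bazel verifies the downloaded archive by hashing it and comparing against the pinned `sha256` attribute, which is why the comment above suggests updating the hash whenever the commit changes. A minimal Python sketch of that integrity check (the helper name and payload here are illustrative, not Bazel internals):

```python
import hashlib

def verify_archive(data: bytes, expected_sha256: str) -> bool:
    """Hash the archive bytes and compare to the pinned digest."""
    return hashlib.sha256(data).hexdigest() == expected_sha256

# Illustrative payload only -- not the real hedron archive.
payload = b"example archive bytes"
assert verify_archive(payload, hashlib.sha256(payload).hexdigest())
assert not verify_archive(payload, "0" * 64)
```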
2 changes: 1 addition & 1 deletion ci/env/install-dependencies.sh
@@ -379,7 +379,7 @@ install_dependencies() {
# dependencies with Modin.
if [ "${INSTALL_LUDWIG-}" = 1 ]; then
# TODO: eventually pin this to master.
pip install -U "ludwig[test]">=0.4
pip install -U "ludwig[test]">=0.4 jsonschema>=4
fi

# Data processing test dependencies.
14 changes: 12 additions & 2 deletions ci/lint/check_api_annotations.py
@@ -5,7 +5,16 @@
import ray
from ray.util.annotations import _is_annotated

IGNORE_PATHS = {".impl.", ".backend.", ".experimental.", ".internal.", ".generated."}
IGNORE_PATHS = {
".impl.",
".backend.",
".experimental.",
".internal.",
".generated.",
".test_utils.",
".annotations.",
".deprecation",
}
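The lint script exempts a symbol when any of these fragments appears in its dotted path, so the three new entries skip test utilities, the annotation machinery itself, and deprecation helpers. A sketch of that membership test (`is_ignored` is a hypothetical helper name, not the script's actual function):

```python
IGNORE_PATHS = {
    ".impl.", ".backend.", ".experimental.", ".internal.",
    ".generated.", ".test_utils.", ".annotations.", ".deprecation",
}

def is_ignored(fullname: str) -> bool:
    """Skip a symbol if any ignored fragment appears in its dotted path."""
    return any(fragment in fullname for fragment in IGNORE_PATHS)

# Symbols under ignored sub-packages are skipped; public API paths are not.
assert is_ignored("ray.data.impl.block_list")
assert is_ignored("ray.util.annotations._is_annotated")
assert not is_ignored("ray.data.read_api")
```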


def _fullname(attr):
@@ -76,11 +85,12 @@ def verify(symbol, scanned, ok, output, prefix=None):
verify(ray.data, set(), ok, output)
# Sanity check the lint logic.
assert len(ok) >= 60, len(ok)

verify(ray.rllib, set(), ok, output)
# TODO(ekl) enable it for all modules.
# verify(ray.ml, set(), ok, output)
# verify(ray.train, set(), ok, output)
# verify(ray.serve, set(), ok, output)
# verify(ray.rllib, set(), ok, output)
# verify(ray.tune, set(), ok, output)
# verify(ray, set(), ok, output)

4 changes: 4 additions & 0 deletions ci/pipeline/determine_tests_to_run.py
@@ -201,6 +201,10 @@ def get_commit_range():
elif any(changed_file.startswith(prefix) for prefix in skip_prefix_list):
# nothing is run but linting in these cases
pass
elif changed_file.startswith("release/ray_release/"):
# Tests for release/ray_release always run, so it is unnecessary to
# tag affected tests.
pass
elif changed_file.endswith("build-docker-images.py"):
RAY_CI_DOCKER_AFFECTED = 1
RAY_CI_LINUX_WHEELS_AFFECTED = 1
5 changes: 4 additions & 1 deletion dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx
@@ -217,7 +217,10 @@ const NodeInfo: React.FC<{}> = () => {
// If a Ray node is running in a K8s pod, it marks available disk as 1 byte.
// (See ReporterAgent._get_disk_usage() in reporter_agent.py)
// Check if there are any nodes with realistic disk total:
const showDisk = nodes.filter((n) => n.disk["/"].total > 10).length !== 0;
const showDisk =
nodes.filter(
(n) => n !== undefined && n.disk !== undefined && n.disk["/"].total > 10,
).length !== 0;

const filterPredicate = (
feature: NodeInfoFeature | HeaderInfo<nodeInfoColumnId>,
8 changes: 7 additions & 1 deletion dashboard/modules/job/job_manager.py
@@ -415,10 +415,16 @@ def _get_supervisor_runtime_env(
runtime_env = (
copy.deepcopy(user_runtime_env) if user_runtime_env is not None else {}
)

# NOTE(edoakes): Can't use .get(, {}) here because we need to handle the case
# where env_vars is explicitly set to `None`.
env_vars = runtime_env.get("env_vars")
if env_vars is None:
env_vars = {}

# Don't set CUDA_VISIBLE_DEVICES for the supervisor actor so the
# driver can use GPUs if it wants to. This will be removed from
# the driver's runtime_env so it isn't inherited by tasks & actors.
env_vars = runtime_env.get("env_vars", {})
env_vars[ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR] = "1"
runtime_env["env_vars"] = env_vars
return runtime_env
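The bug this hunk fixes: `runtime_env.get("env_vars", {})` returns the default only when the key is *absent*, so a runtime env with `env_vars` explicitly set to `None` slipped through and crashed on item assignment. A minimal sketch of the corrected logic (the env-var name is written out literally here for illustration; the real code reads it from `ray_constants`):

```python
def merge_supervisor_env_vars(runtime_env: dict) -> dict:
    """Sketch of the fixed logic: tolerate env_vars missing, {}, or None."""
    # .get("env_vars", {}) is NOT enough: it returns None when the key is
    # present but explicitly None, which then fails on item assignment.
    env_vars = runtime_env.get("env_vars")
    if env_vars is None:
        env_vars = {}
    env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1"
    runtime_env["env_vars"] = env_vars
    return runtime_env

# The explicit-None case from ray-project/ray#25086:
result = merge_supervisor_env_vars({"env_vars": None})
assert result["env_vars"] == {"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"}
```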
31 changes: 29 additions & 2 deletions dashboard/modules/job/tests/test_job_manager.py
@@ -3,6 +3,7 @@
import psutil
import tempfile
import sys
import urllib.request
from uuid import uuid4
import signal

@@ -270,6 +271,24 @@ async def test_submit_with_s3_runtime_env(self, job_manager):
job_manager.get_job_logs(job_id) == "Executing main() from script.py !!\n"
)

async def test_submit_with_file_runtime_env(self, job_manager):
with tempfile.NamedTemporaryFile(suffix=".zip") as f:
filename, _ = urllib.request.urlretrieve(
"https://runtime-env-test.s3.amazonaws.com/script_runtime_env.zip",
filename=f.name,
)
job_id = job_manager.submit_job(
entrypoint="python script.py",
runtime_env={"working_dir": "file://" + filename},
)
await async_wait_for_condition(
check_job_succeeded, job_manager=job_manager, job_id=job_id
)
assert (
job_manager.get_job_logs(job_id)
== "Executing main() from script.py !!\n"
)


@pytest.mark.asyncio
class TestRuntimeEnv:
@@ -424,13 +443,21 @@ def dict_to_str(d):
{JOB_NAME_METADATA_KEY: "custom_name", JOB_ID_METADATA_KEY: job_id}
) in job_manager.get_job_logs(job_id)

async def test_cuda_visible_devices(self, job_manager):
@pytest.mark.parametrize(
"env_vars",
[None, {}, {"hello": "world"}],
)
async def test_cuda_visible_devices(self, job_manager, env_vars):
"""Check CUDA_VISIBLE_DEVICES behavior.

Should not be set in the driver, but should be set in tasks.

We test a variety of `env_vars` parameters due to custom parsing logic
that caused https://github.com/ray-project/ray/issues/25086.
"""
run_cmd = f"python {_driver_script_path('check_cuda_devices.py')}"
job_id = job_manager.submit_job(entrypoint=run_cmd)
runtime_env = {"env_vars": env_vars}
job_id = job_manager.submit_job(entrypoint=run_cmd, runtime_env=runtime_env)

await async_wait_for_condition(
check_job_succeeded, job_manager=job_manager, job_id=job_id
12 changes: 6 additions & 6 deletions dashboard/modules/serve/serve_head.py
@@ -33,14 +33,14 @@ async def get_all_deployments(self, req: Request) -> Response:
@routes.get("/api/serve/deployments/status")
@optional_utils.init_ray_and_catch_exceptions(connect_to_serve=True)
async def get_all_deployment_statuses(self, req: Request) -> Response:
from ray.serve.api import get_deployment_statuses
from ray.serve.schema import serve_application_status_to_schema
from ray.serve.context import get_global_client
from ray.serve.schema import serve_status_to_schema

serve_application_status_schema = serve_application_status_to_schema(
get_deployment_statuses()
)
client = get_global_client(_override_controller_namespace="serve")

serve_status_schema = serve_status_to_schema(client.get_serve_status())
return Response(
text=serve_application_status_schema.json(),
text=serve_status_schema.json(),
content_type="application/json",
)

41 changes: 29 additions & 12 deletions dashboard/modules/serve/tests/test_serve_head.py
@@ -1,14 +1,15 @@
import os
import sys
import json
import time
import pytest
import requests
import subprocess
import sys
import os
from typing import List, Dict, Set

import pytest

import requests
import ray
from ray import serve
from ray._private.test_utils import wait_for_condition


GET_OR_PUT_URL = "http://localhost:8265/api/serve/deployments/"
@@ -124,6 +125,14 @@ def test_put_get_success(ray_start_stop):
GET_OR_PUT_URL, json={"deployments": deployments}, timeout=30
)
assert put_response.status_code == 200

# Use wait_for_condition() to ensure "deep" deployment deleted
wait_for_condition(
lambda: len(requests.get(GET_OR_PUT_URL, timeout=3).json()["deployments"])
== 2,
timeout=10,
)

assert (
requests.get("http://localhost:8000/shallow", timeout=30).text
== "Hello shallow world!"
@@ -176,9 +185,12 @@ def test_delete_success(ray_start_stop):
delete_response = requests.delete(GET_OR_PUT_URL, timeout=30)
assert delete_response.status_code == 200

# Make sure no deployments exist
get_response = requests.get(GET_OR_PUT_URL, timeout=30)
assert len(get_response.json()["deployments"]) == 0
# Make sure all deployments are deleted
wait_for_condition(
lambda: len(requests.get(GET_OR_PUT_URL, timeout=3).json()["deployments"])
== 0,
timeout=10,
)
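These hunks replace one-shot assertions with polling, since deployment deletion is asynchronous and a single `GET` immediately after `DELETE` can race it. A minimal sketch of the polling pattern (Ray's real helper lives in `ray._private.test_utils` and differs in details):

```python
import time

def wait_for_condition(condition, timeout=10, retry_interval=0.1):
    """Poll `condition` until it returns True or `timeout` seconds elapse."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if condition():
            return True
        time.sleep(retry_interval)
    raise TimeoutError("condition was not met before the timeout")

# A condition that only becomes true after a few polls:
state = {"n": 0}
def eventually_true():
    state["n"] += 1
    return state["n"] >= 3

assert wait_for_condition(eventually_true, timeout=2)
```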


def test_get_status_info(ray_start_stop):
@@ -216,18 +228,23 @@ def test_get_status_info(ray_start_stop):

status_response = requests.get(STATUS_URL, timeout=30)
assert status_response.status_code == 200
serve_status = status_response.json()

statuses = status_response.json()["statuses"]
assert len(statuses) == len(deployments)
deployment_statuses = serve_status["deployment_statuses"]
assert len(deployment_statuses) == len(deployments)
expected_deployment_names = {deployment["name"] for deployment in deployments}
for deployment_status in statuses:
for deployment_status in deployment_statuses:
assert deployment_status["name"] in expected_deployment_names
expected_deployment_names.remove(deployment_status["name"])
assert deployment_status["status"] in {"UPDATING", "HEALTHY"}
assert deployment_status["message"] == ""
assert len(expected_deployment_names) == 0

print(statuses)
assert serve_status["app_status"]["status"] in {"DEPLOYING", "RUNNING"}
wait_for_condition(
lambda: time.time() > serve_status["app_status"]["deployment_timestamp"],
timeout=2,
)


def test_serve_namespace(ray_start_stop):