Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/ray-project/ray
Browse files Browse the repository at this point in the history
  • Loading branch information
jianoaix committed Apr 11, 2023
2 parents 1ba2e7d + 92d6f1f commit a48dae4
Show file tree
Hide file tree
Showing 225 changed files with 2,981 additions and 3,164 deletions.
131 changes: 131 additions & 0 deletions dashboard/client/src/components/ActorTable.component.test.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import { render, within } from "@testing-library/react";
import React from "react";
import { MemoryRouter } from "react-router-dom";
import { Actor } from "../type/actor";
import ActorTable from "./ActorTable";
// Both mock actors share identical address fields.
const MOCK_ACTOR_ADDRESS = {
  rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1",
  ipAddress: "172.31.11.178",
  port: 10003,
  workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d",
};

/**
 * Builds a mock Actor for the tests below. Only the fields that differ
 * between the two actors are taken as parameters; everything else is a
 * shared default.
 */
const makeMockActor = (
  actorId: string,
  state: Actor["state"],
  pid: number,
  startTime: number,
): Actor => ({
  actorId,
  jobId: "01000000",
  // Spread so each actor gets its own address object, as in the literals.
  address: { ...MOCK_ACTOR_ADDRESS },
  state,
  numRestarts: "0",
  name: "",
  pid,
  startTime,
  endTime: 0,
  actorClass: "Counter",
  exitDetail: "-",
  requiredResources: {},
  placementGroupId: "123",
  reprName: ",",
});

const MOCK_ACTORS: { [actorId: string]: Actor } = {
  ACTOR_1: makeMockActor("ACTOR_1", "ALIVE", 25321, 1679010689148),
  ACTOR_2: makeMockActor("ACTOR_2", "DEAD", 25322, 1679010689150),
};
describe("ActorTable", () => {
  /**
   * Renders an ActorTable (inside a MemoryRouter, since the table links to
   * actor detail pages) and returns the table rows for ACTOR_1 and ACTOR_2.
   * Also asserts both rows rendered their actor id, which every case below
   * previously repeated inline.
   */
  const renderActorRows = (actors: { [actorId: string]: Actor }) => {
    const { getByRole } = render(
      <MemoryRouter>
        <ActorTable actors={actors} />
      </MemoryRouter>,
    );

    const actor1Row = getByRole("row", {
      name: /ACTOR_1/,
    });
    const actor2Row = getByRole("row", {
      name: /ACTOR_2/,
    });

    expect(within(actor1Row).getByText("ACTOR_1")).toBeInTheDocument();
    expect(within(actor2Row).getByText("ACTOR_2")).toBeInTheDocument();

    return { actor1Row, actor2Row };
  };

  it("renders a table of actors sorted by state", () => {
    const { actor1Row, actor2Row } = renderActorRows(MOCK_ACTORS);

    // ACTOR_1 is ALIVE and ACTOR_2 is DEAD, so actor2Row appears after
    // actor1Row.
    expect(actor1Row.compareDocumentPosition(actor2Row)).toBe(
      Node.DOCUMENT_POSITION_FOLLOWING,
    );
  });

  it("renders a table of actors sorted by startTime asc when states are the same", () => {
    const RUNNING_ACTORS = {
      ...MOCK_ACTORS,
      ACTOR_2: {
        ...MOCK_ACTORS.ACTOR_2,
        state: "ALIVE",
      },
    };

    const { actor1Row, actor2Row } = renderActorRows(RUNNING_ACTORS);

    // Both actors are ALIVE and ACTOR_1 started earlier, so actor2Row
    // appears after actor1Row.
    expect(actor1Row.compareDocumentPosition(actor2Row)).toBe(
      Node.DOCUMENT_POSITION_FOLLOWING,
    );
  });

  it("renders a table of actors sorted by startTime asc when states are the same, actor2 appears first", () => {
    const RUNNING_ACTORS = {
      ...MOCK_ACTORS,
      ACTOR_2: {
        ...MOCK_ACTORS.ACTOR_2,
        state: "ALIVE",
        startTime: 1679010689146,
      },
    };

    const { actor1Row, actor2Row } = renderActorRows(RUNNING_ACTORS);

    // Both actors are ALIVE and ACTOR_2 started earlier, so actor1Row
    // appears after actor2Row.
    expect(actor2Row.compareDocumentPosition(actor1Row)).toBe(
      Node.DOCUMENT_POSITION_FOLLOWING,
    );
  });
});
1 change: 1 addition & 0 deletions doc/source/_includes/rllib/rlmodules_rollout.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.. note:: This doc is related to the `RLModule API <rllib-rlmodule.html>`__ and therefore experimental.
3 changes: 2 additions & 1 deletion doc/source/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ parts:
title: Ray RLlib
sections:
- file: rllib/rllib-training
- file: rllib/core-concepts
- file: rllib/key-concepts
- file: rllib/rllib-env
- file: rllib/rllib-algorithms
- file: rllib/user-guides
Expand All @@ -301,6 +301,7 @@ parts:
- file: rllib/rllib-sample-collection
- file: rllib/rllib-replay-buffers
- file: rllib/rllib-offline
- file: rllib/rllib-catalogs
- file: rllib/rllib-connector
- file: rllib/rllib-rlmodule
- file: rllib/rllib-fault-tolerance
Expand Down
4 changes: 2 additions & 2 deletions doc/source/cluster/doc_code/pytorch_training_e2e_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

kick_off_pytorch_benchmark = (
# Clone ray. If ray is already present, don't clone again.
"git clone https://github.com/ray-project/ray || true;"
"git clone -b ray-2.2.0 https://github.com/ray-project/ray || true;"
# Run the benchmark.
"python ray/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py"
" --data-size-gb=1 --num-epochs=2 --num-workers=1"
Expand All @@ -16,4 +16,4 @@
)

print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow")
print(f"ray job logs '{submission_id}' --address http://127.0.0.1:8265 --follow")
16 changes: 3 additions & 13 deletions doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,23 @@ metadata:
name: raycluster
spec:
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.0.0'
rayVersion: '2.2.0'
######################headGroupSpec#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
# logical group name, for this called head-group, also can be functional
# pod type head or worker
# rayNodeType: head # Not needed since it is under the headgroup
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
dashboard-host: '0.0.0.0'
block: 'true'
#pod template
template:
spec:
containers:
# The Ray head pod
- name: ray-head
image: rayproject/ray-ml:2.0.0-gpu
imagePullPolicy: Always
image: rayproject/ray-ml:2.2.0-gpu
lifecycle:
preStop:
exec:
Expand All @@ -48,7 +44,6 @@ spec:
# logical group name, for this called small-group, also can be functional
groupName: small-group
rayStartParams:
block: 'true'
num-gpus: "1"
#pod template
template:
Expand All @@ -59,14 +54,9 @@ spec:
annotations:
key: value
spec:
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
- name: init-myservice
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
image: rayproject/ray-ml:2.0.0-gpu
image: rayproject/ray-ml:2.2.0-gpu
lifecycle:
preStop:
exec:
Expand Down
6 changes: 2 additions & 4 deletions doc/source/cluster/kubernetes/configs/ray-cluster.log.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,15 @@ metadata:
controller-tools.k8s.io: "1.0"
name: raycluster-complete-logs
spec:
rayVersion: '2.0.0'
rayVersion: '2.3.0'
headGroupSpec:
serviceType: ClusterIP
rayStartParams:
dashboard-host: '0.0.0.0'
block: 'true'
template:
spec:
containers:
- name: ray-head
image: rayproject/ray:2.0.0
image: rayproject/ray:2.3.0
lifecycle:
preStop:
exec:
Expand Down
44 changes: 13 additions & 31 deletions doc/source/cluster/kubernetes/examples/gpu-training-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,18 @@ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container
# (Method 2) "gcloud container clusters get-credentials <your-cluster-name> --region <your-region> --project <your-project>"
# (Method 3) "kubectl config use-context ..."

# Create the KubeRay operator
kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.4.0&timeout=90s"
# Install both CRDs and KubeRay operator v0.5.0.
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm install kuberay-operator kuberay/kuberay-operator --version 0.5.0

# Create a Ray cluster
kubectl apply -f https://raw.githubusercontent.com/ray-project/ray/master/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml

# Set up port-forwarding
kubectl port-forward services/raycluster-head-svc 8265:8265

# Test the cluster
ray job submit --address http://localhost:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
kubectl port-forward --address 0.0.0.0 services/raycluster-head-svc 8265:8265

# Step 3: Run the PyTorch image training benchmark.
# Install Ray if needed
pip3 install -U "ray[default]"

# Download the Python script
Expand All @@ -63,7 +62,7 @@ python3 pytorch_training_e2e_submit.py

# Use the following command to follow this Job's logs:
# Substitute the Ray Job's submission id.
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --follow
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --address http://127.0.0.1:8265 --follow
```
In the rest of this document, we present a more detailed breakdown of the above workflow.

Expand Down Expand Up @@ -114,13 +113,14 @@ It is optional.
```shell
# Step 2: Deploy a Ray cluster on Kubernetes with the KubeRay operator.
# Create the KubeRay operator
kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.4.0&timeout=90s"
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm install kuberay-operator kuberay/kuberay-operator --version 0.5.0

# Create a Ray cluster
kubectl apply -f https://raw.githubusercontent.com/ray-project/ray/master/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml

# port forwarding
kubectl port-forward services/raycluster-head-svc 8265:8265
kubectl port-forward --address 0.0.0.0 services/raycluster-head-svc 8265:8265

# Test cluster (optional)
ray job submit --address http://localhost:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
Expand All @@ -129,26 +129,8 @@ ray job submit --address http://localhost:8265 -- python -c "import ray; ray.ini
## Step 3: Run the PyTorch image training benchmark.
We will use the [Ray Job Python SDK](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/sdk.html#ray-job-sdk) to submit the PyTorch workload.

```python
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://127.0.0.1:8265")

kick_off_pytorch_benchmark = (
# Clone ray. If ray is already present, don't clone again.
"git clone https://github.com/ray-project/ray || true;"
# Run the benchmark.
"python ray/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py"
" --data-size-gb=1 --num-epochs=2 --num-workers=1"
)


submission_id = client.submit_job(
entrypoint=kick_off_pytorch_benchmark,
)

print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow")
```{literalinclude} /cluster/doc_code/pytorch_training_e2e_submit.py
:language: python
```

To submit the workload, run the above Python script. The script is available in the [Ray repository](https://github.com/ray-project/ray/tree/master/doc/source/cluster/doc_code/pytorch_training_e2e_submit.py)
Expand All @@ -168,7 +150,7 @@ python3 pytorch_training_e2e_submit.py

# Track job status
# Substitute the Ray Job's submission id.
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --follow
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --address http://127.0.0.1:8265 --follow
```

## Clean-up
Expand All @@ -177,7 +159,7 @@ Delete your Ray cluster and KubeRay with the following commands:
kubectl delete raycluster raycluster

# Please make sure the ray cluster has already been removed before deleting the operator.
kubectl delete -k "http://github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.4.0&timeout=90s"
helm uninstall kuberay-operator
```
If you're on a public cloud, don't forget to clean up the underlying
node group and/or Kubernetes cluster.
Loading

0 comments on commit a48dae4

Please sign in to comment.