Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/ray-project/ray
Browse files Browse the repository at this point in the history
  • Loading branch information
jianoaix committed Apr 11, 2023
2 parents 1ba2e7d + 92d6f1f commit a48dae4
Show file tree
Hide file tree
Showing 225 changed files with 2,981 additions and 3,164 deletions.
131 changes: 131 additions & 0 deletions dashboard/client/src/components/ActorTable.component.test.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import { render, within } from "@testing-library/react";
import React from "react";
import { MemoryRouter } from "react-router-dom";
import { Actor } from "../type/actor";
import ActorTable from "./ActorTable";
// Both mock actors share identical address fields.
const MOCK_ACTOR_ADDRESS = {
  rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1",
  ipAddress: "172.31.11.178",
  port: 10003,
  workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d",
};

/**
 * Builds a mock Actor for the tests below. Only the fields that differ
 * between the two actors are taken as parameters; everything else is a
 * shared default.
 */
const makeMockActor = (
  actorId: string,
  state: Actor["state"],
  pid: number,
  startTime: number,
): Actor => ({
  actorId,
  jobId: "01000000",
  // Spread so each actor gets its own address object, as in the literals.
  address: { ...MOCK_ACTOR_ADDRESS },
  state,
  numRestarts: "0",
  name: "",
  pid,
  startTime,
  endTime: 0,
  actorClass: "Counter",
  exitDetail: "-",
  requiredResources: {},
  placementGroupId: "123",
  reprName: ",",
});

const MOCK_ACTORS: { [actorId: string]: Actor } = {
  ACTOR_1: makeMockActor("ACTOR_1", "ALIVE", 25321, 1679010689148),
  ACTOR_2: makeMockActor("ACTOR_2", "DEAD", 25322, 1679010689150),
};
describe("ActorTable", () => {
  /**
   * Renders an ActorTable (inside a MemoryRouter, since the table links to
   * actor detail pages) and returns the table rows for ACTOR_1 and ACTOR_2.
   * Also asserts both rows rendered their actor id, which every case below
   * previously repeated inline.
   */
  const renderActorRows = (actors: { [actorId: string]: Actor }) => {
    const { getByRole } = render(
      <MemoryRouter>
        <ActorTable actors={actors} />
      </MemoryRouter>,
    );

    const actor1Row = getByRole("row", {
      name: /ACTOR_1/,
    });
    const actor2Row = getByRole("row", {
      name: /ACTOR_2/,
    });

    expect(within(actor1Row).getByText("ACTOR_1")).toBeInTheDocument();
    expect(within(actor2Row).getByText("ACTOR_2")).toBeInTheDocument();

    return { actor1Row, actor2Row };
  };

  it("renders a table of actors sorted by state", () => {
    const { actor1Row, actor2Row } = renderActorRows(MOCK_ACTORS);

    // ACTOR_1 is ALIVE and ACTOR_2 is DEAD, so actor2Row appears after
    // actor1Row.
    expect(actor1Row.compareDocumentPosition(actor2Row)).toBe(
      Node.DOCUMENT_POSITION_FOLLOWING,
    );
  });

  it("renders a table of actors sorted by startTime asc when states are the same", () => {
    const RUNNING_ACTORS = {
      ...MOCK_ACTORS,
      ACTOR_2: {
        ...MOCK_ACTORS.ACTOR_2,
        state: "ALIVE",
      },
    };

    const { actor1Row, actor2Row } = renderActorRows(RUNNING_ACTORS);

    // Both actors are ALIVE and ACTOR_1 started earlier, so actor2Row
    // appears after actor1Row.
    expect(actor1Row.compareDocumentPosition(actor2Row)).toBe(
      Node.DOCUMENT_POSITION_FOLLOWING,
    );
  });

  it("renders a table of actors sorted by startTime asc when states are the same, actor2 appears first", () => {
    const RUNNING_ACTORS = {
      ...MOCK_ACTORS,
      ACTOR_2: {
        ...MOCK_ACTORS.ACTOR_2,
        state: "ALIVE",
        startTime: 1679010689146,
      },
    };

    const { actor1Row, actor2Row } = renderActorRows(RUNNING_ACTORS);

    // Both actors are ALIVE and ACTOR_2 started earlier, so actor1Row
    // appears after actor2Row.
    expect(actor2Row.compareDocumentPosition(actor1Row)).toBe(
      Node.DOCUMENT_POSITION_FOLLOWING,
    );
  });
});
1 change: 1 addition & 0 deletions doc/source/_includes/rllib/rlmodules_rollout.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.. note:: This doc is related to the `RLModule API <rllib-rlmodule.html>`__ and therefore experimental.
3 changes: 2 additions & 1 deletion doc/source/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ parts:
title: Ray RLlib
sections:
- file: rllib/rllib-training
- file: rllib/core-concepts
- file: rllib/key-concepts
- file: rllib/rllib-env
- file: rllib/rllib-algorithms
- file: rllib/user-guides
Expand All @@ -301,6 +301,7 @@ parts:
- file: rllib/rllib-sample-collection
- file: rllib/rllib-replay-buffers
- file: rllib/rllib-offline
- file: rllib/rllib-catalogs
- file: rllib/rllib-connector
- file: rllib/rllib-rlmodule
- file: rllib/rllib-fault-tolerance
Expand Down
4 changes: 2 additions & 2 deletions doc/source/cluster/doc_code/pytorch_training_e2e_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

kick_off_pytorch_benchmark = (
# Clone ray. If ray is already present, don't clone again.
"git clone https://github.com/ray-project/ray || true;"
"git clone -b ray-2.2.0 https://github.com/ray-project/ray || true;"
# Run the benchmark.
"python ray/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py"
" --data-size-gb=1 --num-epochs=2 --num-workers=1"
Expand All @@ -16,4 +16,4 @@
)

print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow")
print(f"ray job logs '{submission_id}' --address http://127.0.0.1:8265 --follow")
16 changes: 3 additions & 13 deletions doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,23 @@ metadata:
name: raycluster
spec:
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.0.0'
rayVersion: '2.2.0'
######################headGroupSpec#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
# logical group name, for this called head-group, also can be functional
# pod type head or worker
# rayNodeType: head # Not needed since it is under the headgroup
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
dashboard-host: '0.0.0.0'
block: 'true'
#pod template
template:
spec:
containers:
# The Ray head pod
- name: ray-head
image: rayproject/ray-ml:2.0.0-gpu
imagePullPolicy: Always
image: rayproject/ray-ml:2.2.0-gpu
lifecycle:
preStop:
exec:
Expand All @@ -48,7 +44,6 @@ spec:
# logical group name, for this called small-group, also can be functional
groupName: small-group
rayStartParams:
block: 'true'
num-gpus: "1"
#pod template
template:
Expand All @@ -59,14 +54,9 @@ spec:
annotations:
key: value
spec:
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
- name: init-myservice
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
image: rayproject/ray-ml:2.0.0-gpu
image: rayproject/ray-ml:2.2.0-gpu
lifecycle:
preStop:
exec:
Expand Down
6 changes: 2 additions & 4 deletions doc/source/cluster/kubernetes/configs/ray-cluster.log.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,15 @@ metadata:
controller-tools.k8s.io: "1.0"
name: raycluster-complete-logs
spec:
rayVersion: '2.0.0'
rayVersion: '2.3.0'
headGroupSpec:
serviceType: ClusterIP
rayStartParams:
dashboard-host: '0.0.0.0'
block: 'true'
template:
spec:
containers:
- name: ray-head
image: rayproject/ray:2.0.0
image: rayproject/ray:2.3.0
lifecycle:
preStop:
exec:
Expand Down
44 changes: 13 additions & 31 deletions doc/source/cluster/kubernetes/examples/gpu-training-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,18 @@ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container
# (Method 2) "gcloud container clusters get-credentials <your-cluster-name> --region <your-region> --project <your-project>"
# (Method 3) "kubectl config use-context ..."

# Create the KubeRay operator
kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.4.0&timeout=90s"
# Install both CRDs and KubeRay operator v0.5.0.
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm install kuberay-operator kuberay/kuberay-operator --version 0.5.0

# Create a Ray cluster
kubectl apply -f https://raw.githubusercontent.com/ray-project/ray/master/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml

# Set up port-forwarding
kubectl port-forward services/raycluster-head-svc 8265:8265

# Test the cluster
ray job submit --address http://localhost:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
kubectl port-forward --address 0.0.0.0 services/raycluster-head-svc 8265:8265

# Step 3: Run the PyTorch image training benchmark.
# Install Ray if needed
pip3 install -U "ray[default]"

# Download the Python script
Expand All @@ -63,7 +62,7 @@ python3 pytorch_training_e2e_submit.py

# Use the following command to follow this Job's logs:
# Substitute the Ray Job's submission id.
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --follow
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --address http://127.0.0.1:8265 --follow
```
In the rest of this document, we present a more detailed breakdown of the above workflow.

Expand Down Expand Up @@ -114,13 +113,14 @@ It is optional.
```shell
# Step 2: Deploy a Ray cluster on Kubernetes with the KubeRay operator.
# Create the KubeRay operator
kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.4.0&timeout=90s"
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm install kuberay-operator kuberay/kuberay-operator --version 0.5.0

# Create a Ray cluster
kubectl apply -f https://raw.githubusercontent.com/ray-project/ray/master/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml

# port forwarding
kubectl port-forward services/raycluster-head-svc 8265:8265
kubectl port-forward --address 0.0.0.0 services/raycluster-head-svc 8265:8265

# Test cluster (optional)
ray job submit --address http://localhost:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
Expand All @@ -129,26 +129,8 @@ ray job submit --address http://localhost:8265 -- python -c "import ray; ray.ini
## Step 3: Run the PyTorch image training benchmark.
We will use the [Ray Job Python SDK](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/sdk.html#ray-job-sdk) to submit the PyTorch workload.

```python
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://127.0.0.1:8265")

kick_off_pytorch_benchmark = (
# Clone ray. If ray is already present, don't clone again.
"git clone https://github.com/ray-project/ray || true;"
# Run the benchmark.
"python ray/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py"
" --data-size-gb=1 --num-epochs=2 --num-workers=1"
)


submission_id = client.submit_job(
entrypoint=kick_off_pytorch_benchmark,
)

print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow")
```{literalinclude} /cluster/doc_code/pytorch_training_e2e_submit.py
:language: python
```

To submit the workload, run the above Python script. The script is available in the [Ray repository](https://github.com/ray-project/ray/tree/master/doc/source/cluster/doc_code/pytorch_training_e2e_submit.py)
Expand All @@ -168,7 +150,7 @@ python3 pytorch_training_e2e_submit.py

# Track job status
# Substitute the Ray Job's submission id.
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --follow
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --address http://127.0.0.1:8265 --follow
```

## Clean-up
Expand All @@ -177,7 +159,7 @@ Delete your Ray cluster and KubeRay with the following commands:
kubectl delete raycluster raycluster

# Please make sure the ray cluster has already been removed before deleting the operator.
kubectl delete -k "http://github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.4.0&timeout=90s"
helm uninstall kuberay-operator
```
If you're on a public cloud, don't forget to clean up the underlying
node group and/or Kubernetes cluster.
Loading

0 comments on commit a48dae4

Please sign in to comment.