From f09b529541cdac94f81538faf636561195b545c0 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Mon, 2 Oct 2023 18:18:57 -0700 Subject: [PATCH 1/4] changes --- docs/kuberay/deploy-on-eks.md | 193 +++++++++------- docs/kuberay/deploy-on-gke.md | 276 ++++++++--------------- docs/kuberay/ray-cluster.aviary-eks.yaml | 2 +- docs/kuberay/ray-cluster.aviary-gke.yaml | 6 +- docs/kuberay/ray-service.aviary-eks.yaml | 16 +- docs/reference.md | 66 ------ 6 files changed, 232 insertions(+), 327 deletions(-) delete mode 100644 docs/reference.md diff --git a/docs/kuberay/deploy-on-eks.md b/docs/kuberay/deploy-on-eks.md index d4e3a162..8edaf055 100644 --- a/docs/kuberay/deploy-on-eks.md +++ b/docs/kuberay/deploy-on-eks.md @@ -1,4 +1,4 @@ -# Deploy Aviary on Amazon EKS using KubeRay +# Deploy RayLLM on Amazon EKS using KubeRay * Note that this document will be extended to include Ray autoscaling and the deployment of multiple models in the near future. # Part 1: Set up a Kubernetes cluster on Amazon EKS @@ -84,15 +84,15 @@ helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0 At this point, you have two options: -1. You can deploy Aviary manually on a `RayCluster` (Part 3), or -2. You can deploy Aviary using a [`RayService` custom resource](https://ray-project.github.io/kuberay/guidance/rayservice/) (Part 4). +1. You can deploy RayLLM manually on a `RayCluster` (Part 3), or +2. You can deploy RayLLM using a [`RayService` custom resource](https://ray-project.github.io/kuberay/guidance/rayservice/) (Part 4). The first option is more flexible for conducting experiments. The second option is recommended for production use due to the additional high availability features provided by the `RayService` custom resource, which will manage the underlying `RayCluster`s for you. -# Part 3: Deploy Aviary on a RayCluster (recommended for experiments) +# Part 3: Deploy RayLLM on a RayCluster (recommended for experiments) -## Step 1: Create a RayCluster with Aviary +## Step 1: Create a RayCluster with RayLLM ```sh # path: docs/kuberay @@ -116,13 +116,13 @@ Something is worth noticing: resources: '"{\"accelerator_type_cpu\": 2}"' # Ray workers: The Ray worker has a Pod resource limit of 48 CPUs and 4 GPUs. - # `accelerator_type_a10` and `accelerator_type_a100_80g` below are only used for Ray logical-resource scheduling. + # `accelerator_type_a10` and `accelerator_type_a100` below are only used for Ray logical-resource scheduling. # This does not imply that each worker has 2 A10 GPUs and 2 A100 GPUs. rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' ``` -## Step 2: Deploy a LLM model with Aviary +## Step 2: Deploy an LLM model with RayLLM ```sh # Step 7.1: Log in to the head Pod @@ -133,67 +133,96 @@ kubectl exec -it $HEAD_POD -- bash # If you don't have one, you can skip this step and deploy other models in Step 7.3. export HUGGING_FACE_HUB_TOKEN=${YOUR_HUGGING_FACE_HUB_TOKEN} -# Step 7.3: Deploy a LLM model. You can deploy Falcon-7B if you don't have a Hugging Face Hub token. +# Step 7.3: Deploy an LLM model. You can deploy Falcon-7B if you don't have a Hugging Face Hub token. 
# (1) Llama 2 7B -aviary run --model ~/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml -# (2) Falcon 7B -aviary run --model ./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml +serve run serve/meta-llama--Llama-2-7b-chat-hf.yaml # Step 7.3: Check the Serve application status serve status # [Example output] -# name: OpenAssistant--falcon-7b-sft-top1-696 -# app_status: -# status: RUNNING -# message: '' -# deployment_timestamp: 1691109255.5476327 -# deployment_statuses: -# - name: OpenAssistant--falcon-7b-sft-top1-696_OpenAssistant--falcon-7b-sft-top1-696 -# status: HEALTHY -# message: '' -# --- -# name: router -# app_status: -# status: RUNNING -# message: '' -# deployment_timestamp: 1691109255.6641886 -# deployment_statuses: -# - name: router_Router -# status: HEALTHY -# message: '' - -# Step 7.4: List all models -export AVIARY_URL="http://localhost:8000" -aviary models +# proxies: +# e4dc8d29f19e3900c0b93dabb76ce9bcc6f42e36bdf5484ca57ec774: HEALTHY +# 4f4edf80bf644846175eec0a4daabb3f3775e64738720b6b2ae5c139: HEALTHY +# applications: +# router: +# status: RUNNING +# message: '' +# last_deployed_time_s: 1694808658.0861287 +# deployments: +# Router: +# status: HEALTHY +# replica_states: +# RUNNING: 2 +# message: '' +# meta-llama--Llama-2-7b-chat-hf: +# status: RUNNING +# message: '' +# last_deployed_time_s: 1694808658.0861287 +# deployments: +# meta-llama--Llama-2-7b-chat-hf: +# status: HEALTHY +# replica_states: +# RUNNING: 1 +# message: '' + +# Step 7.4: Check the live Serve app's config +serve config # [Example output] -# Connecting to Aviary backend at: http://localhost:8000/ -# OpenAssistant/falcon-7b-sft-top1-696 - -# Step 7.5: Send a query to `OpenAssistant/falcon-7b-sft-top1-696`. -aviary query --model OpenAssistant/falcon-7b-sft-top1-696 --prompt "What are the top 5 most popular programming languages?" +# name: router +# route_prefix: / +# import_path: aviary.backend:router_application +# args: +# models: +# meta-llama/Llama-2-7b-chat-hf: ./models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml -# [Example output for `OpenAssistant/falcon-7b-sft-top1-696`] -# Connecting to Aviary backend at: http://localhost:8000/v1 -# OpenAssistant/falcon-7b-sft-top1-696: -# The top five most popular programming languages globally, according to TIOBE, are Java, Python, C, C++, and JavaScript. However, popularity can vary by region, industry, and -# other factors. Additionally, the definition of a programming language can vary, leading to different rankings depending on the methodology used. Some rankings may include or -# exclude specific scripting languages or high-level language variants, for example. +# --- -# Here are some additional rankings of the most popular programming languages: -# * **Top 10 programming languages in 2023**: Python, JavaScript, C#, Java, PHP, TypeScript, Swift, Golang, Ruby, and Kotlin. -# [Source](https://www.toptal.com/software/programming-languages/2023-best-programming-languages/) -# * **Top 10 programming languages in 2022**: Python, JavaScript, Java, C++, C#, PHP, Swift, Kotlin, R, and TypeScript. -# [Source](https://www.toptal.com/software/programming-languages/2022-best-programming-languages/) -# * **Top 10 programming languages in 2021**: Python, JavaScript, Java, C++, C#, PHP, Swift, Go, Kotlin, and TypeScript. -# ..... -# These rankings can change frequently, so it's important to keep up to date with the latest trends. 
+# name: meta-llama--Llama-2-7b-chat-hf +# route_prefix: /meta-llama--Llama-2-7b-chat-hf +# import_path: aviary.backend:llm_application +# args: +# model: ./models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml + +# Step 7.5: Send a query to `meta-llama/Llama-2-7b-chat-hf`. +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-2-7b-chat-hf", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What are the top 5 most popular programming languages?"} + ], + "temperature": 0.7 + }' + +# [Example output for `meta-llama/Llama-2-7b-chat-hf`] +{ + "id":"meta-llama/Llama-2-7b-chat-hf-95239f0b-4601-4557-8a33-3977e9b6b779", + "object":"text_completion","created":1694814804,"model":"meta-llama/Llama-2-7b-chat-hf", + "choices":[ + { + "message": + { + "role":"assistant", + "content":"As a helpful assistant, I'm glad to provide you with the top 5 most popular programming languages based on various sources and metrics:\n\n1. Java: Java is a popular language used for developing enterprise-level applications, Android apps, and web applications. It's known for its platform independence, which allows Java developers to create applications that can run on any device supporting the Java Virtual Machine (JVM).\n\n2. Python: Python is a versatile language that's widely used in various industries, including web development, data science, artificial intelligence, and machine learning. Its simplicity, readability, and ease of use make it a favorite among developers.\n\n3. JavaScript: JavaScript is the language of the web and is used for creating interactive client-side functionality for web applications. It's also used in mobile app development, game development, and server-side programming.\n\n4. C++: C++ is a high-performance language used for developing operating systems, games, and other high-performance applications. It's known for its efficiency, speed, and flexibility, making it a popular choice among developers.\n\n5. PHP: PHP is a server-side scripting language used for web development, especially for building dynamic websites and web applications. It's known for its ease of use and is widely used in the web development community.\n\nThese are the top 5 most popular programming languages based on various sources, but it's worth noting that programming language popularity can vary depending on the source and the time frame considered." + }, + "index":0, + "finish_reason":"stop" + } + ], + "usage":{ + "prompt_tokens":39, + "completion_tokens":330, + "total_tokens":369 + } +} ``` -# Part 4: Deploy Aviary on a RayService (recommended for production) +# Part 4: Deploy RayLLM on a RayService (recommended for production) -## Step 1: Create a RayService with Aviary +## Step 1: Create a RayService with RayLLM ```sh # path: docs/kuberay @@ -229,34 +258,38 @@ serveConfigV2: | OpenAssistant/falcon-7b-sft-top1-696: ./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml ``` -In the YAML file, we use the `serveConfigV2` field to configure two LLM serve applications, one for LightGPT and one for Falcon-7B. +In the YAML file, we use the `serveConfigV2` field to configure two LLM Serve applications, one for LightGPT and one for Falcon-7B. It's important to note that the `model` argument refers to the path of the LLM model's YAML file, located in the Ray head Pod. 
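As a sketch of how this configuration extends (the third model name and config path below are hypothetical), serving an additional model only requires another entry under the router's `models` map, keyed by the model ID and pointing at that model's YAML file inside the head Pod:

```yaml
serveConfigV2: |
  applications:
  - name: router
    import_path: aviary.backend:router_application
    route_prefix: /
    args:
      models:
        amazon/LightGPT: ./models/continuous_batching/amazon--LightGPT.yaml
        OpenAssistant/falcon-7b-sft-top1-696: ./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml
        # Hypothetical third model: the key is the model ID used in requests and
        # the value is the path to its config YAML on the Ray head Pod.
        my-org/my-model: ./models/continuous_batching/my-org--my-model.yaml
```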
-## Step 2: Send a query to both `amazon/LightGPT` and `OpenAssistant/falcon-7b-sft-top1-696` +## Step 2: Send a query to both `amazon/LightGPT` and `OpenAssistant/falcon-7b-sft-top1-696`. ```sh -# Step 2.1: Port forward the Kubernetes serve service. -# Note that the service will be created only when all serve applications are ready. +# Step 2.1: Port forward the Kubernetes Serve service. +# Note that the service will be created only when all Serve applications are ready. kubectl get svc # Check if `aviary-serve-svc` is created. kubectl port-forward service/aviary-serve-svc 8000:8000 -# Step 2.2: Install the Aviary client if not already installed. -pip install "aviary @ git+https://github.com/ray-project/aviary.git" - -# Step 2.3: List models via the Aviary CLI outside the Kubernetes cluster. -export AVIARY_URL="http://localhost:8000" -aviary models +# Step 2.2: Check that the models have started running using `serve status` +serve status # [Example output] # Connecting to Aviary backend at: http://localhost:8000/v1 # OpenAssistant/falcon-7b-sft-top1-696 # amazon/LightGPT -# Step 2.4: Send a query to `amazon/LightGPT`. -aviary query --model amazon/LightGPT --prompt "What are the top 5 most popular programming languages?" +# Step 2.3: Send a query to `amazon/LightGPT`. +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "amazon/LightGPT", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What are the top 5 most popular programming languages?"} + ], + "temperature": 0.7 + }' # [Example output] -# Connecting to Aviary backend at: http://localhost:8000/v1 # amazon/LightGPT: # 1. Java # 2. C++ @@ -264,8 +297,17 @@ aviary query --model amazon/LightGPT --prompt "What are the top 5 most popular p # 4. Python # 5. SQL -# Step 2.5: Send a query to `OpenAssistant/falcon-7b-sft-top1-696`. -aviary query --model OpenAssistant/falcon-7b-sft-top1-696 --prompt "What are the top 5 most popular programming languages?" +# Step 2.4: Send a query to `OpenAssistant/falcon-7b-sft-top1-696`. +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "OpenAssistant/falcon-7b-sft-top1-696", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What are the top 5 most popular programming languages?"} + ], + "temperature": 0.7 + }' # [Example output for `OpenAssistant/falcon-7b-sft-top1-696`] # Connecting to Aviary backend at: http://localhost:8000/v1 @@ -282,20 +324,19 @@ aviary query --model OpenAssistant/falcon-7b-sft-top1-696 --prompt "What are the # * **Top 10 programming languages in 2021**: Python, JavaScript, Java, C++, C#, PHP, Swift, Go, Kotlin, and TypeScript. # ..... # These rankings can change frequently, so it's important to keep up to date with the latest trends. - -# Step 2.6: Send a query to `OpenAssistant/falcon-7b-sft-top1-696` and get streaming response. -aviary stream --model OpenAssistant/falcon-7b-sft-top1-696 --prompt "What are the top 5 most popular programming languages?" ``` +Check out the RayLLM README to learn more ways to query models, such as with the Python `requests` library or the OpenAI package. Use these techniques to stream responses from the models. + # Part 5: Clean up resources **Warning: GPU nodes are extremely expensive. 
Please remember to delete the cluster if you no longer need it.** ```sh # path: docs/kuberay -# Case 1: Aviary was deployed on a RayCluster +# Case 1: RayLLM was deployed on a RayCluster kubectl delete -f ray-cluster.aviary-eks.yaml -# Case 2: Aviary was deployed as a RayService +# Case 2: RayLLM was deployed as a RayService kubectl delete -f ray-service.aviary-eks.yaml # Uninstall the KubeRay operator chart diff --git a/docs/kuberay/deploy-on-gke.md b/docs/kuberay/deploy-on-gke.md index 6c172811..a945846a 100644 --- a/docs/kuberay/deploy-on-gke.md +++ b/docs/kuberay/deploy-on-gke.md @@ -1,10 +1,10 @@ -# Deploy Aviary on Googke Kubernetes Engine (GKE) using KubeRay +# Deploy RayLLM on Google Kubernetes Engine (GKE) using KubeRay In this tutorial, we will: 1. Set up a Kubernetes cluster on GKE. 2. Deploy the KubeRay operator and a Ray cluster on GKE. -3. Run an LLM model with Aviary. +3. Run an LLM model with Ray Serve. * Note that this document will be extended to include Ray autoscaling and the deployment of multiple models in the near future. @@ -13,12 +13,12 @@ In this tutorial, we will: Run this command and all following commands on your local machine or on the [Google Cloud Shell](https://cloud.google.com/shell). If running from your local machine, you will need to install the [Google Cloud SDK](https://cloud.google.com/sdk/docs/install). ```sh -gcloud container clusters create aviary-gpu-cluster \ +gcloud container clusters create rayllm-gpu-cluster \ --num-nodes=1 --min-nodes 0 --max-nodes 1 --enable-autoscaling \ --zone=us-west1-b --machine-type e2-standard-8 ``` -This command creates a Kubernetes cluster named `aviary-gpu-cluster` with 1 node in the `us-west1-b` zone. In this example, we use the `e2-standard-8` machine type, which has 8 vCPUs and 32 GB RAM. The cluster has autoscaling enabled, so the number of nodes can increase or decrease based on the workload. +This command creates a Kubernetes cluster named `rayllm-gpu-cluster` with 1 node in the `us-west1-b` zone. In this example, we use the `e2-standard-8` machine type, which has 8 vCPUs and 32 GB RAM. The cluster has autoscaling enabled, so the number of nodes can increase or decrease based on the workload. You can also create a cluster from the [Google Cloud Console](https://console.cloud.google.com/kubernetes/list). @@ -31,7 +31,7 @@ Run the following command to create a GPU node pool for Ray GPU workers. gcloud container node-pools create gpu-node-pool \ --accelerator type=nvidia-l4-vws,count=4 \ --zone us-west1-b \ - --cluster aviary-gpu-cluster \ + --cluster rayllm-gpu-cluster \ --num-nodes 1 \ --min-nodes 0 \ --max-nodes 1 \ @@ -66,7 +66,7 @@ For more on taints and tolerations, see the [Kubernetes documentation](https://k Run the following command to download credentials and configure the Kubernetes CLI to use them. ```sh -gcloud container clusters get-credentials aviary-gpu-cluster --zone us-west1-b +gcloud container clusters get-credentials rayllm-gpu-cluster --zone us-west1-b ``` For more details, see the [GKE documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl). @@ -79,73 +79,36 @@ This step is required for GPU support on GKE. See the [GKE documentation](https: # Install NVIDIA GPU device driver kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml -# Verify that your nodes have allocatable GPUs. It may take a few seconds for the GPUs to be allocated. 
+# Verify that your nodes have allocatable GPUs kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" # Example output: -# NAME GPU -# gke-aviary-gpu-cluster-default-pool-ceb8fe4d-8dqw -# gke-aviary-gpu-cluster-gpu-node-pool-2f4a373c-8q3q 4 -``` - -### Troubleshooting - -If you never see the allocatable GPUs, or if the GPU node pool scaled down to zero nodes, you can still proceed to the next step. The GPU device driver will be installed when the GPU node pool scales up again. - -To debug issues with the GPU device driver installation, you can run the following command: - -```sh -kubectl get pod -n kube-system -``` - -After finding the correct pod, you can check the logs with: - -```sh -kubectl logs -n kube-system nvidia-driver-installer-xxxxx -c nvidia-driver-installer +# NAME GPU +# ... 4 +# ... ``` ## Step 5: Install the KubeRay operator ```sh -# Install both CRDs and KubeRay operator v0.6.0. +# Install both CRDs and KubeRay operator v0.5.0. helm repo add kuberay https://ray-project.github.io/kuberay-helm/ -helm repo update -helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0 +helm install kuberay-operator kuberay/kuberay-operator --version 0.5.0 -# Check that it is scheduled on the CPU node. If it is not, something is wrong. -kubectl get pods -o wide -# Example output: -# NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -# kuberay-operator-54f657c8cf-6ln5j 1/1 Running 0 66s 10.32.0.12 gke-aviary-gpu-cluster-default-pool-ceb8fe4d-8dqw +# It should be scheduled on the CPU node. If it is not, something is wrong. ``` -## Step 6: Deploy Aviary - -At this point, you have two options: - -1. You can deploy Aviary manually on a `RayCluster`, or -2. You can deploy Aviary using a [`RayService` custom resource](https://ray-project.github.io/kuberay/guidance/rayservice/). - -The first option is more flexible for conducting experiments. The second option is recommended for production use due to the additional high availability features provided by the `RayService` custom resource, which will manage the underlying `RayCluster`s for you. +## Step 6: Create a RayCluster with RayLLM -If you are running this tutorial on the Google Cloud Shell, please copy the file `docs/kuberay/ray-cluster.aviary-gke.yaml` or `docs/kuberay/ray-service.aviary-gke.yaml` to the Google Cloud Shell, depending on which option you're using. You may find it useful to use the [Cloud Shell Editor](https://cloud.google.com/shell/docs/editor-overview) to edit the file. +If you are running this tutorial on the Google Cloud Shell, please copy the file `docs/kuberay/ray-cluster.aviary-gke.yaml` to the Google Cloud Shell. You may find it useful to use the [Cloud Shell Editor](https://cloud.google.com/shell/docs/editor-overview) to edit the file. -Now you can create a RayCluster with Aviary. Aviary is included in the image `anyscale/aviary:latest`, which is specified in the RayCluster YAML manifest `ray-cluster.aviary-gke.yaml`. - -Run one of the following two commands: +Now you can create a RayCluster with RayLLM. RayLLM is included in the image `anyscale/aviary:latest`, which is specified in the RayCluster YAML manifest `ray-cluster.aviary-gke.yaml`. 
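For reference, the image is set on both the head and worker containers in that manifest; the relevant fields look roughly like the abbreviated excerpt below (see the full file for the complete Pod specs):

```yaml
headGroupSpec:
  template:
    spec:
      containers:
      - name: ray-head
        image: anyscale/aviary:latest   # RayLLM image for the head Pod
workerGroupSpecs:
- groupName: gpu-group
  template:
    spec:
      containers:
      - name: llm
        image: anyscale/aviary:latest   # same image for the GPU workers
```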
```sh # path: docs/kuberay -# Option 1: Deploy Aviary on a RayCluster kubectl apply -f ray-cluster.aviary-gke.yaml ``` -```sh -# path: docs/kuberay -# Option 2: Deploy Aviary as a RayService -kubectl apply -f ray-service.aviary-gke.yaml -``` - Note the following aspects of the YAML file: * The `tolerations` for workers match the taints we specified in Step 2. This ensures that the Ray GPU workers are scheduled on the GPU node pool. @@ -171,156 +134,113 @@ Note the following aspects of the YAML file: resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 4}"' ``` -* (If using Option 2: Deploy Aviary as a RayService) The `ray-service.aviary-gke.yaml` manifest contains the following Ray Serve options: - - ```yaml - serviceUnhealthySecondThreshold: 1200 # Config for the health check threshold for service. Default value is 60. - deploymentUnhealthySecondThreshold: 1200 # Config for the health check threshold for deployments. Default value is 60. - serveConfigV2: | - applications: - - name: amazon--LightGPT - import_path: aviary.backend:llm_application - route_prefix: /amazon--LightGPT - args: - model: "./models/continuous_batching/amazon--LightGPT.yaml" - ``` - - It also has a field `RayClusterSpec`, which describes the spec for the underlying `RayCluster`. Here we have used the same configuration as in `ray-cluster.aviary-gke.yaml` above, with the following change: - * We have specified the `containerPort: 8000` with the name `serve` in the head pod spec and the worker pod spec. - -## Step 7: Deploy an LLM model with Aviary - - -In [Step 6](#step-6-deploy-aviary), if you used "Option 1: Deploy Aviary on a RayCluster", please follow [Step 7A](#step-7a-deploy-an-llm-model-with-aviary-on-a-raycluster). Otherwise, if you used "Option 2: Deploy Aviary as a RayService", please follow [Step 7B](#step-7b-deploy-an-llm-model-with-aviary-as-a-rayservice). - -### Step 7A: Deploy an LLM model with Aviary via SSH on a RayCluster - - +## Step 7: Deploy an LLM model with RayLLM ```sh -# Step 7A.1: Log in to the head Pod +# Step 7.1: Log in to the head Pod export HEAD_POD=$(kubectl get pods --selector=ray.io/node-type=head -o custom-columns=POD:metadata.name --no-headers) kubectl exec -it $HEAD_POD -- bash -# Step 7A.2: Deploy the `mosaicml/mpt-7b-chat` model -aviary run --model ./models/static_batching/mosaicml--mpt-7b-chat.yaml +# Step 7.2: Deploy the `meta-llama/Llama-2-7b-chat-hf` model +serve run serve/meta-llama--Llama-2-7b-chat-hf.yaml -# Step 7A.3: Check the Serve application status +# Step 7.3: Check the Serve application status serve status # [Example output] -# name: default -# app_status: -# status: RUNNING -# message: '' -# deployment_timestamp: 1686006910.9571936 -# deployment_statuses: -# - name: default_mosaicml--mpt-7b-chat -# status: HEALTHY -# message: '' -# - name: default_RouterDeployment -# status: HEALTHY -# message: '' - -# Step 7A.4: List all models -export AVIARY_URL="http://localhost:8000" -aviary models - -# [Example output] -# Connecting to Aviary backend at: http://localhost:8000/ -# mosaicml/mpt-7b-chat - -# Step 7A.5: Send a query to `mosaicml/mpt-7b-chat`. -aviary query --model mosaicml/mpt-7b-chat --prompt "What are the top 5 most popular programming languages?" 
+# proxies: +# e4dc8d29f19e3900c0b93dabb76ce9bcc6f42e36bdf5484ca57ec774: HEALTHY +# 4f4edf80bf644846175eec0a4daabb3f3775e64738720b6b2ae5c139: HEALTHY +# applications: +# router: +# status: RUNNING +# message: '' +# last_deployed_time_s: 1694808658.0861287 +# deployments: +# Router: +# status: HEALTHY +# replica_states: +# RUNNING: 2 +# message: '' +# meta-llama--Llama-2-7b-chat-hf: +# status: RUNNING +# message: '' +# last_deployed_time_s: 1694808658.0861287 +# deployments: +# meta-llama--Llama-2-7b-chat-hf: +# status: HEALTHY +# replica_states: +# RUNNING: 1 +# message: '' + +# Step 7.4: Check the live Serve app's config +serve config # [Example output] -# Connecting to Aviary backend at: http://localhost:8000/ -# mosaicml/mpt-7b-chat: -# 1. Python -# 2. Java -# 3. JavaScript -# 4. C++ -# 5. C# -``` - -### Step 7B: Deploy an LLM model with Aviary as a RayService - -```sh -# Step 7B.0: Wait for the service to be ready. -# Note that the service will be created only when all serve applications are ready. -kubectl get svc # Check if `aviary-serve-svc` is created. - -# If the service is not yet ready, check the status by running `serve status` on the head pod. After a few minutes, the status should move from UPDATING to HEALTHY. -# You can find the head pod by running `kubectl get pod`. - -# kubectl exec -it aviary-raycluster-sfz6r-head-59g8h -- serve status - -# name: amazon--LightGPT -# app_status: -# status: DEPLOYING -# message: '' -# deployment_timestamp: 1691448736.5297794 -# deployment_statuses: -# - name: amazon--LightGPT_amazon--LightGPT -# status: UPDATING -# message: Deployment amazon--LightGPT_amazon--LightGPT has 1 replicas that have taken -# more than 30s to initialize. This may be caused by a slow __init__ or reconfigure -# method. - -# Step 7B.1: Port forward the Kubernetes serve service. This command will block, so please open a new terminal. -kubectl port-forward service/aviary-serve-svc 8000:8000 - -# Step 7B.2: Ensure the Aviary client is installed. You may also need to run `pip install boto3` and `pip install pydantic` if you run into Python import errors. -pip install "aviary @ git+https://github.com/ray-project/aviary.git" - -# Step 7B.3: List models via the Aviary CLI outside the Kubernetes cluster. -export AVIARY_URL="http://localhost:8000" -aviary models - -# Example output: -# Connecting to Aviary backend at: http://localhost:8000/v1 -# amazon/LightGPT - -# Step 7B.4: Send a query to `amazon/LightGPT`. -aviary query --model amazon/LightGPT --prompt "What are the top 5 most popular programming languages?" - -# Example output: -# Connecting to Aviary backend at: http://localhost:8000/v1 -# amazon/LightGPT: -# 1. JavaScript -# 2. Java -# 3. Python -# 4. C++ -# 5. C# +# name: router +# route_prefix: / +# import_path: aviary.backend:router_application +# args: +# models: +# meta-llama/Llama-2-7b-chat-hf: ./models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml + +# --- + +# name: meta-llama--Llama-2-7b-chat-hf +# route_prefix: /meta-llama--Llama-2-7b-chat-hf +# import_path: aviary.backend:llm_application +# args: +# model: ./models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml + +# Step 7.5: Send a query to `meta-llama/Llama-2-7b-chat-hf`. 
+curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-2-7b-chat-hf", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What are the top 5 most popular programming languages?"} + ], + "temperature": 0.7 + }' + +# [Example output for `meta-llama/Llama-2-7b-chat-hf`] +{ + "id":"meta-llama/Llama-2-7b-chat-hf-95239f0b-4601-4557-8a33-3977e9b6b779", + "object":"text_completion","created":1694814804,"model":"meta-llama/Llama-2-7b-chat-hf", + "choices":[ + { + "message": + { + "role":"assistant", + "content":"As a helpful assistant, I'm glad to provide you with the top 5 most popular programming languages based on various sources and metrics:\n\n1. Java: Java is a popular language used for developing enterprise-level applications, Android apps, and web applications. It's known for its platform independence, which allows Java developers to create applications that can run on any device supporting the Java Virtual Machine (JVM).\n\n2. Python: Python is a versatile language that's widely used in various industries, including web development, data science, artificial intelligence, and machine learning. Its simplicity, readability, and ease of use make it a favorite among developers.\n\n3. JavaScript: JavaScript is the language of the web and is used for creating interactive client-side functionality for web applications. It's also used in mobile app development, game development, and server-side programming.\n\n4. C++: C++ is a high-performance language used for developing operating systems, games, and other high-performance applications. It's known for its efficiency, speed, and flexibility, making it a popular choice among developers.\n\n5. PHP: PHP is a server-side scripting language used for web development, especially for building dynamic websites and web applications. It's known for its ease of use and is widely used in the web development community.\n\nThese are the top 5 most popular programming languages based on various sources, but it's worth noting that programming language popularity can vary depending on the source and the time frame considered." + }, + "index":0, + "finish_reason":"stop" + } + ], + "usage":{ + "prompt_tokens":39, + "completion_tokens":330, + "total_tokens":369 + } +} ``` ## Step 8: Clean up resources **Warning: GPU nodes are extremely expensive. Please remember to delete the cluster if you no longer need it.** -Run one of the following commands, depending on which option you chose in [Step 6](#step-6-deploy-aviary). - ```sh -# Step 8.1A: Delete the RayCluster custom resource +# Step 8.1: Delete the RayCluster # path: docs/kuberay kubectl delete -f ray-cluster.aviary-gke.yaml -``` - -```sh -# Step 8.1B: Delete the RayService custom resource -# path: docs/kuberay -kubectl delete -f ray-service.aviary-gke.yaml -``` - -Finally, run the following commands to delete the KubeRay operator chart and the GKE cluster. - -```sh # Step 8.2: Uninstall the KubeRay operator chart helm uninstall kuberay-operator # Step 8.3: Delete the GKE cluster -gcloud container clusters delete aviary-gpu-cluster +gcloud container clusters delete rayllm-gpu-cluster ``` See the [GKE documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/deleting-a-cluster) for more details on deleting a GKE cluster. 
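As in the EKS guide, the RayLLM README describes other ways to query the endpoint, such as the Python `requests` library or the OpenAI package, including streaming responses. The snippet below is a minimal `requests` sketch, not a reference client: run it wherever `localhost:8000` is reachable (for example inside the head Pod, or through a port-forward) while the cluster from Step 7 is still up, and note that support for the `stream` flag and the exact response framing depend on the deployed RayLLM version.

```python
import requests

# Sketch: POST to the same OpenAI-compatible chat completions endpoint used in
# the curl example above. "stream": True asks the server to send the response
# incrementally; each streamed line is printed as it arrives.
url = "http://localhost:8000/v1/chat/completions"
payload = {
    "model": "meta-llama/Llama-2-7b-chat-hf",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What are the top 5 most popular programming languages?"},
    ],
    "temperature": 0.7,
    "stream": True,
}

with requests.post(url, json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))
```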
diff --git a/docs/kuberay/ray-cluster.aviary-eks.yaml b/docs/kuberay/ray-cluster.aviary-eks.yaml index 823388ee..7a3412df 100644 --- a/docs/kuberay/ray-cluster.aviary-eks.yaml +++ b/docs/kuberay/ray-cluster.aviary-eks.yaml @@ -39,7 +39,7 @@ spec: # logical group name, for this called small-group, also can be functional groupName: gpu-group rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' #pod template template: spec: diff --git a/docs/kuberay/ray-cluster.aviary-gke.yaml b/docs/kuberay/ray-cluster.aviary-gke.yaml index 86927136..c1ebb961 100644 --- a/docs/kuberay/ray-cluster.aviary-gke.yaml +++ b/docs/kuberay/ray-cluster.aviary-gke.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: ray-head - image: anyscale/aviary:latest + image: anyscale/aviary:latest-tgi resources: limits: cpu: 2 @@ -39,13 +39,13 @@ spec: # logical group name, for this called small-group, also can be functional groupName: gpu-group rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' # pod template template: spec: containers: - name: llm - image: anyscale/aviary:latest + image: anyscale/aviary:latest-tgi lifecycle: preStop: exec: diff --git a/docs/kuberay/ray-service.aviary-eks.yaml b/docs/kuberay/ray-service.aviary-eks.yaml index 38d03749..738c994d 100644 --- a/docs/kuberay/ray-service.aviary-eks.yaml +++ b/docs/kuberay/ray-service.aviary-eks.yaml @@ -7,13 +7,23 @@ spec: deploymentUnhealthySecondThreshold: 1200 # Config for the health check threshold for deployments. Default value is 60. 
serveConfigV2: | applications: + - name: amazon--LightGPT + import_path: aviary.backend:llm_application + route_prefix: /amazon--LightGPT + args: + model: "./models/continuous_batching/amazon--LightGPT.yaml" + - name: OpenAssistant--falcon-7b-sft-top1-696 + import_path: aviary.backend:llm_application + route_prefix: /OpenAssistant--falcon-7b-sft-top1-696 + args: + model: "./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml" - name: router import_path: aviary.backend:router_application route_prefix: / args: models: - - ./models/continuous_batching/amazon--LightGPT.yaml - - ./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml + amazon/LightGPT: ./models/continuous_batching/amazon--LightGPT.yaml + OpenAssistant/falcon-7b-sft-top1-696: ./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml rayClusterConfig: # Ray head pod template headGroupSpec: @@ -53,7 +63,7 @@ spec: # logical group name, for this called small-group, also can be functional groupName: gpu-group rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' #pod template template: spec: diff --git a/docs/reference.md b/docs/reference.md deleted file mode 100644 index 5840b2f4..00000000 --- a/docs/reference.md +++ /dev/null @@ -1,66 +0,0 @@ - -# Aviary Reference - -## Installing Aviary - -To install Aviary and its dependencies, run the following command: - -```shell -pip install "aviary @ git+https://github.com/ray-project/aviary.git" -``` - -The default Aviary installation only includes the Aviary API client. - -Aviary consists of a backend and a frontend (Aviary Explorer), both of which come with additional -dependencies. To install the dependencies for the frontend run the following commands: - -```shell -pip install "aviary[frontend] @ git+https://github.com/ray-project/aviary.git" -``` - -The backend dependencies are heavy weight, and quite large. We recommend using the official -`anyscale/aviary` image. Installing the backend manually is not a supported usecase. - -## Running Aviary Frontend locally - -Aviary consists of two components, a backend and a frontend. -The Backend exposes a Ray Serve FastAPI interface running on a Ray cluster allowing you to deploy various LLMs efficiently. - -The frontend is a [Gradio](https://gradio.app/) interface that allows you to interact -with the models in the backend through a web interface. -The Gradio app is served using [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). - -To run the Aviary frontend locally, you need to set the following environment variable: - -```shell -export AVIARY_URL= -``` - -Once you have set these environment variables, you can run the frontend with the -following command: - -```shell -serve run aviary.frontend.app:app -``` - -To just use the Gradio frontend without Ray Serve, you can start it -with `python aviary/frontend/app.py`. - -In any case, the Gradio interface should be accessible at `http://localhost:7860` -in your browser. -If running the frontend yourself is not an option, you can still use -[our hosted version](http://aviary.anyscale.com/) for your experiments. - -### Usage stats collection - -Aviary backend collects basic, non-identifiable usage statistics to help us improve the project. -The mechanism for collection is the same as in Ray. 
-For more information on what is collected and how to opt-out, see the -[Usage Stats Collection](https://docs.ray.io/en/latest/cluster/usage-stats.html) page in -Ray documentation. - -## Aviary Model Registry - -Aviary allows you to easily add new models by adding a single configuration file. -To learn more about how to customize or add new models, -see the [Aviary Model Registry](models/README.md). From 6f70dfc851d5aa07f47e2ba20aed618163727769 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Mon, 2 Oct 2023 18:26:27 -0700 Subject: [PATCH 2/4] Apply suggestions from code review Signed-off-by: Richard Liaw --- docs/kuberay/deploy-on-eks.md | 4 ++-- docs/kuberay/ray-cluster.aviary-gke.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/kuberay/deploy-on-eks.md b/docs/kuberay/deploy-on-eks.md index 8edaf055..9de9e4a5 100644 --- a/docs/kuberay/deploy-on-eks.md +++ b/docs/kuberay/deploy-on-eks.md @@ -116,10 +116,10 @@ Something is worth noticing: resources: '"{\"accelerator_type_cpu\": 2}"' # Ray workers: The Ray worker has a Pod resource limit of 48 CPUs and 4 GPUs. - # `accelerator_type_a10` and `accelerator_type_a100` below are only used for Ray logical-resource scheduling. + # `accelerator_type_a10` and `accelerator_type_a100_80g` below are only used for Ray logical-resource scheduling. # This does not imply that each worker has 2 A10 GPUs and 2 A100 GPUs. rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' ``` ## Step 2: Deploy an LLM model with RayLLM diff --git a/docs/kuberay/ray-cluster.aviary-gke.yaml b/docs/kuberay/ray-cluster.aviary-gke.yaml index c1ebb961..86927136 100644 --- a/docs/kuberay/ray-cluster.aviary-gke.yaml +++ b/docs/kuberay/ray-cluster.aviary-gke.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: ray-head - image: anyscale/aviary:latest-tgi + image: anyscale/aviary:latest resources: limits: cpu: 2 @@ -39,13 +39,13 @@ spec: # logical group name, for this called small-group, also can be functional groupName: gpu-group rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' # pod template template: spec: containers: - name: llm - image: anyscale/aviary:latest-tgi + image: anyscale/aviary:latest lifecycle: preStop: exec: From 3ae8d70e16c58d0d9e9027d8460df9f9119a4c1b Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Mon, 2 Oct 2023 18:27:20 -0700 Subject: [PATCH 3/4] Apply suggestions from code review Signed-off-by: Richard Liaw --- docs/kuberay/deploy-on-gke.md | 4 ++-- docs/kuberay/ray-cluster.aviary-eks.yaml | 2 +- docs/kuberay/ray-service.aviary-eks.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/kuberay/deploy-on-gke.md b/docs/kuberay/deploy-on-gke.md index a945846a..de8e8a43 100644 --- a/docs/kuberay/deploy-on-gke.md +++ b/docs/kuberay/deploy-on-gke.md @@ -91,9 +91,9 @@ kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable ## Step 5: Install the KubeRay operator ```sh -# Install both CRDs and KubeRay operator v0.5.0. +# Install both CRDs and KubeRay operator v0.6.0. 
helm repo add kuberay https://ray-project.github.io/kuberay-helm/ -helm install kuberay-operator kuberay/kuberay-operator --version 0.5.0 +helm repo update # It should be scheduled on the CPU node. If it is not, something is wrong. ``` diff --git a/docs/kuberay/ray-cluster.aviary-eks.yaml b/docs/kuberay/ray-cluster.aviary-eks.yaml index 7a3412df..823388ee 100644 --- a/docs/kuberay/ray-cluster.aviary-eks.yaml +++ b/docs/kuberay/ray-cluster.aviary-eks.yaml @@ -39,7 +39,7 @@ spec: # logical group name, for this called small-group, also can be functional groupName: gpu-group rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' #pod template template: spec: diff --git a/docs/kuberay/ray-service.aviary-eks.yaml b/docs/kuberay/ray-service.aviary-eks.yaml index 738c994d..44a469a6 100644 --- a/docs/kuberay/ray-service.aviary-eks.yaml +++ b/docs/kuberay/ray-service.aviary-eks.yaml @@ -63,7 +63,7 @@ spec: # logical group name, for this called small-group, also can be functional groupName: gpu-group rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' #pod template template: spec: From f22e811630de74cf20ac4a1f8c9fcaca48eecae5 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Tue, 3 Oct 2023 18:34:44 -0700 Subject: [PATCH 4/4] update --- docs/kuberay/ray-service.aviary-eks.yaml | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/docs/kuberay/ray-service.aviary-eks.yaml b/docs/kuberay/ray-service.aviary-eks.yaml index 738c994d..38d03749 100644 --- a/docs/kuberay/ray-service.aviary-eks.yaml +++ b/docs/kuberay/ray-service.aviary-eks.yaml @@ -7,23 +7,13 @@ spec: deploymentUnhealthySecondThreshold: 1200 # Config for the health check threshold for deployments. Default value is 60. serveConfigV2: | applications: - - name: amazon--LightGPT - import_path: aviary.backend:llm_application - route_prefix: /amazon--LightGPT - args: - model: "./models/continuous_batching/amazon--LightGPT.yaml" - - name: OpenAssistant--falcon-7b-sft-top1-696 - import_path: aviary.backend:llm_application - route_prefix: /OpenAssistant--falcon-7b-sft-top1-696 - args: - model: "./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml" - name: router import_path: aviary.backend:router_application route_prefix: / args: models: - amazon/LightGPT: ./models/continuous_batching/amazon--LightGPT.yaml - OpenAssistant/falcon-7b-sft-top1-696: ./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml + - ./models/continuous_batching/amazon--LightGPT.yaml + - ./models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml rayClusterConfig: # Ray head pod template headGroupSpec: @@ -63,7 +53,7 @@ spec: # logical group name, for this called small-group, also can be functional groupName: gpu-group rayStartParams: - resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100\": 2}"' + resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"' #pod template template: spec: