Merge branch 'main' into Combine_compose
ZePan110 authored Nov 8, 2024
2 parents 11bb987 + 617e119 commit 6033f43
Showing 20 changed files with 116 additions and 946 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/check-online-doc-build.yml
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

name: Check Online Document Building
permissions: {}

on:
  pull_request:
    branches: [main]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:

      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: GenAIComps

      - name: Checkout docs
        uses: actions/checkout@v4
        with:
          repository: opea-project/docs
          path: docs

      - name: Build Online Document
        shell: bash
        run: |
          echo "build online doc"
          cd docs
          bash scripts/build.sh
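
The workflow checks out GenAIComps alongside the opea-project/docs repository and runs the docs build script. A rough sketch of reproducing the same check locally, assuming the clone location and working directory used by the workflow (both are assumptions, not part of this commit):

```python
# Local reproduction of the CI doc-build check (sketch only; the repo URL and
# script path come from the workflow above, the working directory is assumed).
import subprocess

# Clone the docs repository next to the GenAIComps checkout, as the workflow does.
subprocess.run(
    ["git", "clone", "https://github.com/opea-project/docs.git", "docs"],
    check=True,
)

# Run the same build script the workflow invokes.
subprocess.run(["bash", "scripts/build.sh"], cwd="docs", check=True)
```
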
111 changes: 72 additions & 39 deletions comps/cores/mega/gateway.py
@@ -25,6 +25,41 @@
from .micro_service import MicroService


def read_pdf(file):
    from langchain.document_loaders import PyPDFLoader

    loader = PyPDFLoader(file)
    docs = loader.load_and_split()
    return docs


def read_text_from_file(file, save_file_name):
    import docx2txt
    from langchain.text_splitter import CharacterTextSplitter

    # read text file
    if file.headers["content-type"] == "text/plain":
        file.file.seek(0)
        content = file.file.read().decode("utf-8")
        # Split text
        text_splitter = CharacterTextSplitter()
        texts = text_splitter.split_text(content)
        # Create multiple documents
        file_content = texts
    # read pdf file
    elif file.headers["content-type"] == "application/pdf":
        documents = read_pdf(save_file_name)
        file_content = [doc.page_content for doc in documents]
    # read docx file
    elif (
        file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        or file.headers["content-type"] == "application/octet-stream"
    ):
        file_content = docx2txt.process(save_file_name)

    return file_content
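
These helpers are now module-level functions so that both the DocSum and FaqGen gateways can share them. A minimal usage sketch, assuming the uploaded bytes have already been written to disk; the stand-in upload object below is hypothetical and only mimics the `.headers` and `.file` attributes the helper reads:

```python
# Hypothetical driver for read_text_from_file(); FakeUpload only mimics the
# parts of fastapi.UploadFile that the helper touches.
class FakeUpload:
    def __init__(self, path, content_type):
        self.headers = {"content-type": content_type}
        self.file = open(path, "rb")


upload = FakeUpload("/tmp/report.pdf", "application/pdf")
chunks = read_text_from_file(upload, "/tmp/report.pdf")
print(len(chunks))  # a list of page texts for a PDF, a single string for a DOCX
```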


class Gateway:
    def __init__(
        self,
@@ -72,8 +107,19 @@ async def handle_request(self, request: Request):

    def list_service(self):
        response = {}
        for node in self.all_leaves():
            response = {self.services[node].description: self.services[node].endpoint_path}
        for node, service in self.megaservice.services.items():
            # Check if the service has a 'description' attribute and it is not None
            if hasattr(service, "description") and service.description:
                response[node] = {"description": service.description}
            # Check if the service has an 'endpoint' attribute and it is not None
            if hasattr(service, "endpoint") and service.endpoint:
                if node in response:
                    response[node]["endpoint"] = service.endpoint
                else:
                    response[node] = {"endpoint": service.endpoint}
            # If neither 'description' nor 'endpoint' is available, add an error message for the node
            if node not in response:
                response[node] = {"error": f"Service node {node} does not have 'description' or 'endpoint' attribute."}
        return response
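
With this change, `list_service()` iterates over every node registered on the megaservice and collects whatever metadata each service carries, instead of overwriting a single flattened entry on each pass. The result is a dict keyed by node name; an illustrative shape (node names, descriptions, and endpoints below are made up):

```python
# Illustrative shape of the list_service() response; all values are examples.
example_response = {
    "llm/MicroService": {
        "description": "Text generation service",
        "endpoint": "http://localhost:9000/v1/chat/completions",
    },
    "embedding/MicroService": {
        "endpoint": "http://localhost:6000/v1/embeddings",
    },
    "broken/MicroService": {
        "error": "Service node broken/MicroService does not have 'description' or 'endpoint' attribute."
    },
}
```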

    def list_parameter(self):
@@ -365,39 +411,6 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
            megaservice, host, port, str(MegaServiceEndpoint.DOC_SUMMARY), ChatCompletionRequest, ChatCompletionResponse
        )

    def read_pdf(self, file):
        from langchain.document_loaders import PyPDFLoader

        loader = PyPDFLoader(file)
        docs = loader.load_and_split()
        return docs

    def read_text_from_file(self, file, save_file_name):
        import docx2txt
        from langchain.text_splitter import CharacterTextSplitter

        # read text file
        if file.headers["content-type"] == "text/plain":
            file.file.seek(0)
            content = file.file.read().decode("utf-8")
            # Split text
            text_splitter = CharacterTextSplitter()
            texts = text_splitter.split_text(content)
            # Create multiple documents
            file_content = texts
        # read pdf file
        elif file.headers["content-type"] == "application/pdf":
            documents = self.read_pdf(save_file_name)
            file_content = [doc.page_content for doc in documents]
        # read docx file
        elif (
            file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            or file.headers["content-type"] == "application/octet-stream"
        ):
            file_content = docx2txt.process(save_file_name)

        return file_content

    async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
        data = await request.form()
        stream_opt = data.get("stream", True)
@@ -411,7 +424,7 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(

                async with aiofiles.open(file_path, "wb") as f:
                    await f.write(await file.read())
                docs = self.read_text_from_file(file, file_path)
                docs = read_text_from_file(file, file_path)
                os.remove(file_path)
                if isinstance(docs, list):
                    file_summaries.extend(docs)
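
Because the handler accepts uploads through `files: List[UploadFile]`, clients send multipart form data rather than a JSON body. A hedged client sketch using `requests`; the host, port, and `/v1/docsum` path are assumptions based on `MegaServiceEndpoint.DOC_SUMMARY` and the default gateway port, not part of this diff:

```python
# Hypothetical DocSum client call; the URL and endpoint path are assumptions.
import requests

with open("report.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8888/v1/docsum",
        data={"messages": "Summarize the attached report.", "stream": "false"},
        files={"files": ("report.pdf", f, "application/pdf")},  # field name matches the handler's `files` parameter
        timeout=300,
    )
print(resp.status_code)
```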
@@ -547,11 +560,31 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
            megaservice, host, port, str(MegaServiceEndpoint.FAQ_GEN), ChatCompletionRequest, ChatCompletionResponse
        )

    async def handle_request(self, request: Request):
        data = await request.json()
    async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
        data = await request.form()
        stream_opt = data.get("stream", True)
        chat_request = ChatCompletionRequest.parse_obj(data)
        prompt = self._handle_message(chat_request.messages)
        file_summaries = []
        if files:
            for file in files:
                file_path = f"/tmp/{file.filename}"

                import aiofiles

                async with aiofiles.open(file_path, "wb") as f:
                    await f.write(await file.read())
                docs = read_text_from_file(file, file_path)
                os.remove(file_path)
                if isinstance(docs, list):
                    file_summaries.extend(docs)
                else:
                    file_summaries.append(docs)

        if file_summaries:
            prompt = self._handle_message(chat_request.messages) + "\n".join(file_summaries)
        else:
            prompt = self._handle_message(chat_request.messages)

        parameters = LLMParams(
            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
            top_k=chat_request.top_k if chat_request.top_k else 10,
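
The FaqGen gateway now follows the same pattern as DocSum: it switches from a JSON body to multipart form data, appends the text of any uploaded files to the prompt, and falls back to the plain chat messages when nothing is attached. A matching client sketch; the `/v1/faqgen` path and port are assumptions based on `MegaServiceEndpoint.FAQ_GEN`:

```python
# Hypothetical FaqGen client call; the URL and endpoint path are assumptions.
import requests

with open("notes.txt", "rb") as f:
    resp = requests.post(
        "http://localhost:8888/v1/faqgen",
        data={"messages": "Generate FAQs from the attached notes.", "stream": "false"},
        files={"files": ("notes.txt", f, "text/plain")},
        timeout=300,
    )
print(resp.status_code)
```
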
2 changes: 2 additions & 0 deletions comps/cores/mega/micro_service.py
@@ -38,6 +38,7 @@ def __init__(
        provider: Optional[str] = None,
        provider_endpoint: Optional[str] = None,
        use_remote_service: Optional[bool] = False,
        description: Optional[str] = None,
        dynamic_batching: bool = False,
        dynamic_batching_timeout: int = 1,
        dynamic_batching_max_batch_size: int = 32,
@@ -53,6 +54,7 @@ def __init__(
        self.input_datatype = input_datatype
        self.output_datatype = output_datatype
        self.use_remote_service = use_remote_service
        self.description = description
        self.dynamic_batching = dynamic_batching
        self.dynamic_batching_timeout = dynamic_batching_timeout
        self.dynamic_batching_max_batch_size = dynamic_batching_max_batch_size
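
The new `description` field is what the gateway's `list_service()` surfaces, so services that want a human-readable label should pass it at construction time. A minimal sketch; the import path and the other constructor arguments (`name`, `host`, `port`, `endpoint`, `use_remote_service`) follow common MicroService usage and are assumptions, not part of this diff:

```python
# Sketch only; argument values are illustrative and may differ in your checkout.
from comps import MicroService  # import path is an assumption

llm = MicroService(
    name="llm",
    host="0.0.0.0",
    port=9000,
    endpoint="/v1/chat/completions",
    use_remote_service=True,
    description="Text generation microservice backed by TGI/vLLM",
)
```
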
138 changes: 10 additions & 128 deletions comps/llms/text-generation/README.md
@@ -2,20 +2,20 @@

This microservice, designed for large language model (LLM) inference, processes input consisting of a query string and associated reranked documents. It constructs a prompt based on the query and documents, which is then used to perform inference with a large language model. The service delivers the inference results as output.

A prerequisite for using this microservice is that users must have an LLM text generation service (e.g., TGI, vLLM, or Ray) already running. Users need to set the LLM service's endpoint in an environment variable. The microservice uses this endpoint to create an LLM object, enabling it to communicate with the LLM service and execute language model operations.
A prerequisite for using this microservice is that users must have an LLM text generation service (e.g., TGI or vLLM) already running. Users need to set the LLM service's endpoint in an environment variable. The microservice uses this endpoint to create an LLM object, enabling it to communicate with the LLM service and execute language model operations.

Overall, this microservice offers a streamlined way to integrate large language model inference into applications, requiring minimal setup from the user beyond initiating a TGI/vLLM/Ray service and configuring the necessary environment variables. This allows for the seamless processing of queries and documents to generate intelligent, context-aware responses.
Overall, this microservice offers a streamlined way to integrate large language model inference into applications, requiring minimal setup from the user beyond initiating a TGI/vLLM service and configuring the necessary environment variables. This allows for the seamless processing of queries and documents to generate intelligent, context-aware responses.

## Validated LLM Models

| Model                       | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | Ray |
| --------------------------- | --------- | -------- | ---------- | --- |
| [Intel/neural-chat-7b-v3-3] | ✓         | ✓        | ✓          | ✓   |
| [Llama-2-7b-chat-hf]        | ✓         | ✓        | ✓          | ✓   |
| [Llama-2-70b-chat-hf]       | ✓         | -        | ✓          | x   |
| [Meta-Llama-3-8B-Instruct]  | ✓         | ✓        | ✓          | ✓   |
| [Meta-Llama-3-70B-Instruct] | ✓         | -        | ✓          | x   |
| [Phi-3]                     | x         | Limit 4K | Limit 4K   | ✓   |
| Model                       | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi |
| --------------------------- | --------- | -------- | ---------- |
| [Intel/neural-chat-7b-v3-3] | ✓         | ✓        | ✓          |
| [Llama-2-7b-chat-hf]        | ✓         | ✓        | ✓          |
| [Llama-2-70b-chat-hf]       | ✓         | -        | ✓          |
| [Meta-Llama-3-8B-Instruct]  | ✓         | ✓        | ✓          |
| [Meta-Llama-3-70B-Instruct] | ✓         | -        | ✓          |
| [Phi-3]                     | x         | Limit 4K | Limit 4K   |

## Clone OPEA GenAIComps

@@ -121,53 +121,6 @@ export vLLM_ENDPOINT="http://${vLLM_HOST_IP}:8008"
python llm.py
```

#### 1.2.3 Start the Ray Service

Install the requirements for Ray Service

```bash
cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray

pip install -r requirements.txt
```

Execute the docker run command to initiate the backend, along with the Python script that launches the microservice.

```bash
export vLLM_RAY_HOST_IP=$(hostname -I | awk '{print $1}') # This sets IP of the current machine
export LLM_MODEL=${your_hf_llm_model}
export DATA_DIR=$HOME/data # Location to download the model
export HF_TOKEN=${your_hf_api_token}

# Build the image first as opea/vllm:cpu
bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray/dependency/build_docker_vllmray.sh

# Initiate the backend
docker run \
--name="vllm-ray-service" \
--runtime=habana \
-v $DATA_DIR:/data \
-e HABANA_VISIBLE_DEVICES=all \
-e OMPI_MCA_btl_vader_single_copy_mechanism=none \
--cap-add=sys_nice \
--ipc=host \
-p 8006:8000 \
-e HF_TOKEN=$HF_TOKEN \
opea/vllm_ray:habana \
/bin/bash -c " \
ray start --head && \
python vllm_ray_openai.py \
--port_number 8000 \
--model_id_or_path $LLM_MODEL \
--tensor_parallel_size 2 \
--enforce_eager False"

# Start the microservice with an endpoint as the above docker run command
export vLLM_RAY_ENDPOINT="http://${vLLM_RAY_HOST_IP}:8006"

python llm.py
```

## 🚀2. Start Microservice with Docker (Option 2)

To start the microservices with Docker, you first need to build the Docker images for the microservice.
@@ -203,22 +156,6 @@ docker build \
-f comps/llms/text-generation/vllm/langchain/Dockerfile .
```

#### 2.1.3 Ray

```bash
# Build the Ray Serve docker
bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray/dependency/build_docker_vllmray.sh

# Build the microservice docker
cd ${OPEA_GENAICOMPS_ROOT}

docker build \
--build-arg https_proxy=$https_proxy \
--build-arg http_proxy=$http_proxy \
-t opea/llm-vllm-ray:latest \
-f comps/llms/text-generation/vllm/ray/Dockerfile .
```

### 2.2 Start LLM Service with the built image

To start a docker container, you have two options:
@@ -247,15 +184,6 @@ export vLLM_LLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL=${your_hf_llm_model}
```

In order to start Ray serve and LLM services, you need to setup the following environment variables first.

```bash
export HF_TOKEN=${your_hf_api_token}
export RAY_Serve_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL=${your_hf_llm_model}
export CHAT_PROCESSOR="ChatModelLlama"
```

### 2.3 Run Docker with CLI (Option A)

#### 2.3.1 TGI
@@ -311,29 +239,6 @@ docker run \
opea/llm-vllm:latest
```

#### 2.3.3 Ray Serve

Start Ray Serve endpoint.

```bash
bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray/dependency/launch_vllmray.sh
```

Start Ray Serve microservice.

```bash
docker run -d \
--name="llm-ray-server" \
-p 9000:9000 \
--ipc=host \
-e http_proxy=$http_proxy \
-e https_proxy=$https_proxy \
-e RAY_Serve_ENDPOINT=$RAY_Serve_ENDPOINT \
-e HF_TOKEN=$HF_TOKEN \
-e LLM_MODEL=$LLM_MODEL \
opea/llm-ray:latest
```

### 2.4 Run Docker with Docker Compose (Option B)

#### 2.4.1 TGI
@@ -350,13 +255,6 @@ cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/langchain
docker compose -f docker_compose_llm.yaml up -d
```

#### 2.4.3 Ray Serve

```bash
cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray
docker compose -f docker_compose_llm.yaml up -d
```

## 🚀3. Consume LLM Service

### 3.1 Check Service Status
@@ -391,22 +289,6 @@ curl http://${your_ip}:8008/v1/completions \
}'
```

#### 3.2.3 Verify the Ray Service

```bash
curl http://${your_ip}:8008/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": ${your_hf_llm_model},
"messages": [
{"role": "assistant", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is Deep Learning?"}
],
"max_tokens": 32,
"stream": true
}'
```

### 3.3 Consume LLM Service

You can set the following model parameters according to your needs, such as `max_tokens` and `streaming`.