Merge branch 'main' into Combine_compose
ZePan110 authored Nov 8, 2024
2 parents 11bb987 + 617e119 commit 6033f43
Showing 20 changed files with 116 additions and 946 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/check-online-doc-build.yml
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

name: Check Online Document Building
permissions: {}

on:
  pull_request:
    branches: [main]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:

      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: GenAIComps

      - name: Checkout docs
        uses: actions/checkout@v4
        with:
          repository: opea-project/docs
          path: docs

      - name: Build Online Document
        shell: bash
        run: |
          echo "build online doc"
          cd docs
          bash scripts/build.sh
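
The workflow checks out GenAIComps alongside the opea-project/docs repository and runs the docs build script. A rough sketch of reproducing the same check locally, assuming the clone location and working directory used by the workflow (both are assumptions, not part of this commit):

```python
# Local reproduction of the CI doc-build check (sketch only; the repo URL and
# script path come from the workflow above, the working directory is assumed).
import subprocess

# Clone the docs repository next to the GenAIComps checkout, as the workflow does.
subprocess.run(
    ["git", "clone", "https://github.com/opea-project/docs.git", "docs"],
    check=True,
)

# Run the same build script the workflow invokes.
subprocess.run(["bash", "scripts/build.sh"], cwd="docs", check=True)
```
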
111 changes: 72 additions & 39 deletions comps/cores/mega/gateway.py
@@ -25,6 +25,41 @@
from .micro_service import MicroService


def read_pdf(file):
    from langchain.document_loaders import PyPDFLoader

    loader = PyPDFLoader(file)
    docs = loader.load_and_split()
    return docs


def read_text_from_file(file, save_file_name):
    import docx2txt
    from langchain.text_splitter import CharacterTextSplitter

    # read text file
    if file.headers["content-type"] == "text/plain":
        file.file.seek(0)
        content = file.file.read().decode("utf-8")
        # Split text
        text_splitter = CharacterTextSplitter()
        texts = text_splitter.split_text(content)
        # Create multiple documents
        file_content = texts
    # read pdf file
    elif file.headers["content-type"] == "application/pdf":
        documents = read_pdf(save_file_name)
        file_content = [doc.page_content for doc in documents]
    # read docx file
    elif (
        file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        or file.headers["content-type"] == "application/octet-stream"
    ):
        file_content = docx2txt.process(save_file_name)

    return file_content
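
These helpers are now module-level functions so that both the DocSum and FaqGen gateways can share them. A minimal usage sketch, assuming the uploaded bytes have already been written to disk; the stand-in upload object below is hypothetical and only mimics the `.headers` and `.file` attributes the helper reads:

```python
# Hypothetical driver for read_text_from_file(); FakeUpload only mimics the
# parts of fastapi.UploadFile that the helper touches.
class FakeUpload:
    def __init__(self, path, content_type):
        self.headers = {"content-type": content_type}
        self.file = open(path, "rb")


upload = FakeUpload("/tmp/report.pdf", "application/pdf")
chunks = read_text_from_file(upload, "/tmp/report.pdf")
print(len(chunks))  # a list of page texts for a PDF, a single string for a DOCX
```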


class Gateway:
    def __init__(
        self,
@@ -72,8 +107,19 @@ async def handle_request(self, request: Request):

    def list_service(self):
        response = {}
        for node in self.all_leaves():
            response = {self.services[node].description: self.services[node].endpoint_path}
        for node, service in self.megaservice.services.items():
            # Check if the service has a 'description' attribute and it is not None
            if hasattr(service, "description") and service.description:
                response[node] = {"description": service.description}
            # Check if the service has an 'endpoint' attribute and it is not None
            if hasattr(service, "endpoint") and service.endpoint:
                if node in response:
                    response[node]["endpoint"] = service.endpoint
                else:
                    response[node] = {"endpoint": service.endpoint}
            # If neither 'description' nor 'endpoint' is available, add an error message for the node
            if node not in response:
                response[node] = {"error": f"Service node {node} does not have 'description' or 'endpoint' attribute."}
        return response
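
With this change, `list_service()` iterates over every node registered on the megaservice and collects whatever metadata each service carries, instead of overwriting a single flattened entry on each pass. The result is a dict keyed by node name; an illustrative shape (node names, descriptions, and endpoints below are made up):

```python
# Illustrative shape of the list_service() response; all values are examples.
example_response = {
    "llm/MicroService": {
        "description": "Text generation service",
        "endpoint": "http://localhost:9000/v1/chat/completions",
    },
    "embedding/MicroService": {
        "endpoint": "http://localhost:6000/v1/embeddings",
    },
    "broken/MicroService": {
        "error": "Service node broken/MicroService does not have 'description' or 'endpoint' attribute."
    },
}
```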

    def list_parameter(self):
@@ -365,39 +411,6 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
            megaservice, host, port, str(MegaServiceEndpoint.DOC_SUMMARY), ChatCompletionRequest, ChatCompletionResponse
        )

    def read_pdf(self, file):
        from langchain.document_loaders import PyPDFLoader

        loader = PyPDFLoader(file)
        docs = loader.load_and_split()
        return docs

    def read_text_from_file(self, file, save_file_name):
        import docx2txt
        from langchain.text_splitter import CharacterTextSplitter

        # read text file
        if file.headers["content-type"] == "text/plain":
            file.file.seek(0)
            content = file.file.read().decode("utf-8")
            # Split text
            text_splitter = CharacterTextSplitter()
            texts = text_splitter.split_text(content)
            # Create multiple documents
            file_content = texts
        # read pdf file
        elif file.headers["content-type"] == "application/pdf":
            documents = self.read_pdf(save_file_name)
            file_content = [doc.page_content for doc in documents]
        # read docx file
        elif (
            file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            or file.headers["content-type"] == "application/octet-stream"
        ):
            file_content = docx2txt.process(save_file_name)

        return file_content

    async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
        data = await request.form()
        stream_opt = data.get("stream", True)
@@ -411,7 +424,7 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(

                async with aiofiles.open(file_path, "wb") as f:
                    await f.write(await file.read())
                docs = self.read_text_from_file(file, file_path)
                docs = read_text_from_file(file, file_path)
                os.remove(file_path)
                if isinstance(docs, list):
                    file_summaries.extend(docs)
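
Because the handler accepts uploads through `files: List[UploadFile]`, clients send multipart form data rather than a JSON body. A hedged client sketch using `requests`; the host, port, and `/v1/docsum` path are assumptions based on `MegaServiceEndpoint.DOC_SUMMARY` and the default gateway port, not part of this diff:

```python
# Hypothetical DocSum client call; the URL and endpoint path are assumptions.
import requests

with open("report.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8888/v1/docsum",
        data={"messages": "Summarize the attached report.", "stream": "false"},
        files={"files": ("report.pdf", f, "application/pdf")},  # field name matches the handler's `files` parameter
        timeout=300,
    )
print(resp.status_code)
```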
@@ -547,11 +560,31 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
            megaservice, host, port, str(MegaServiceEndpoint.FAQ_GEN), ChatCompletionRequest, ChatCompletionResponse
        )

    async def handle_request(self, request: Request):
        data = await request.json()
    async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
        data = await request.form()
        stream_opt = data.get("stream", True)
        chat_request = ChatCompletionRequest.parse_obj(data)
        prompt = self._handle_message(chat_request.messages)
        file_summaries = []
        if files:
            for file in files:
                file_path = f"/tmp/{file.filename}"

                import aiofiles

                async with aiofiles.open(file_path, "wb") as f:
                    await f.write(await file.read())
                docs = read_text_from_file(file, file_path)
                os.remove(file_path)
                if isinstance(docs, list):
                    file_summaries.extend(docs)
                else:
                    file_summaries.append(docs)

        if file_summaries:
            prompt = self._handle_message(chat_request.messages) + "\n".join(file_summaries)
        else:
            prompt = self._handle_message(chat_request.messages)

        parameters = LLMParams(
            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
            top_k=chat_request.top_k if chat_request.top_k else 10,
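
The FaqGen gateway now follows the same pattern as DocSum: it switches from a JSON body to multipart form data, appends the text of any uploaded files to the prompt, and falls back to the plain chat messages when nothing is attached. A matching client sketch; the `/v1/faqgen` path and port are assumptions based on `MegaServiceEndpoint.FAQ_GEN`:

```python
# Hypothetical FaqGen client call; the URL and endpoint path are assumptions.
import requests

with open("notes.txt", "rb") as f:
    resp = requests.post(
        "http://localhost:8888/v1/faqgen",
        data={"messages": "Generate FAQs from the attached notes.", "stream": "false"},
        files={"files": ("notes.txt", f, "text/plain")},
        timeout=300,
    )
print(resp.status_code)
```
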
2 changes: 2 additions & 0 deletions comps/cores/mega/micro_service.py
@@ -38,6 +38,7 @@ def __init__(
        provider: Optional[str] = None,
        provider_endpoint: Optional[str] = None,
        use_remote_service: Optional[bool] = False,
        description: Optional[str] = None,
        dynamic_batching: bool = False,
        dynamic_batching_timeout: int = 1,
        dynamic_batching_max_batch_size: int = 32,
@@ -53,6 +54,7 @@ def __init__(
        self.input_datatype = input_datatype
        self.output_datatype = output_datatype
        self.use_remote_service = use_remote_service
        self.description = description
        self.dynamic_batching = dynamic_batching
        self.dynamic_batching_timeout = dynamic_batching_timeout
        self.dynamic_batching_max_batch_size = dynamic_batching_max_batch_size
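
The new `description` field is what the gateway's `list_service()` surfaces, so services that want a human-readable label should pass it at construction time. A minimal sketch; the import path and the other constructor arguments (`name`, `host`, `port`, `endpoint`, `use_remote_service`) follow common MicroService usage and are assumptions, not part of this diff:

```python
# Sketch only; argument values are illustrative and may differ in your checkout.
from comps import MicroService  # import path is an assumption

llm = MicroService(
    name="llm",
    host="0.0.0.0",
    port=9000,
    endpoint="/v1/chat/completions",
    use_remote_service=True,
    description="Text generation microservice backed by TGI/vLLM",
)
```
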
138 changes: 10 additions & 128 deletions comps/llms/text-generation/README.md
@@ -2,20 +2,20 @@

This microservice, designed for large language model (LLM) inference, processes input consisting of a query string and associated reranked documents. It constructs a prompt based on the query and documents, which is then used to perform inference with a large language model. The service delivers the inference results as output.

A prerequisite for using this microservice is that users must have an LLM text generation service (e.g., TGI, vLLM, or Ray) already running. Users need to set the LLM service's endpoint in an environment variable. The microservice uses this endpoint to create an LLM object, enabling it to communicate with the LLM service and execute language model operations.
A prerequisite for using this microservice is that users must have an LLM text generation service (e.g., TGI or vLLM) already running. Users need to set the LLM service's endpoint in an environment variable. The microservice uses this endpoint to create an LLM object, enabling it to communicate with the LLM service and execute language model operations.

Overall, this microservice offers a streamlined way to integrate large language model inference into applications, requiring minimal setup from the user beyond initiating a TGI/vLLM/Ray service and configuring the necessary environment variables. This allows for the seamless processing of queries and documents to generate intelligent, context-aware responses.
Overall, this microservice offers a streamlined way to integrate large language model inference into applications, requiring minimal setup from the user beyond initiating a TGI/vLLM service and configuring the necessary environment variables. This allows for the seamless processing of queries and documents to generate intelligent, context-aware responses.

## Validated LLM Models

| Model                       | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | Ray |
| --------------------------- | --------- | -------- | ---------- | --- |
| [Intel/neural-chat-7b-v3-3] | ✓         | ✓        | ✓          | ✓   |
| [Llama-2-7b-chat-hf]        | ✓         | ✓        | ✓          | ✓   |
| [Llama-2-70b-chat-hf]       | ✓         | -        | ✓          | x   |
| [Meta-Llama-3-8B-Instruct]  | ✓         | ✓        | ✓          | ✓   |
| [Meta-Llama-3-70B-Instruct] | ✓         | -        | ✓          | x   |
| [Phi-3]                     | x         | Limit 4K | Limit 4K   | ✓   |
| Model                       | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi |
| --------------------------- | --------- | -------- | ---------- |
| [Intel/neural-chat-7b-v3-3] | ✓         | ✓        | ✓          |
| [Llama-2-7b-chat-hf]        | ✓         | ✓        | ✓          |
| [Llama-2-70b-chat-hf]       | ✓         | -        | ✓          |
| [Meta-Llama-3-8B-Instruct]  | ✓         | ✓        | ✓          |
| [Meta-Llama-3-70B-Instruct] | ✓         | -        | ✓          |
| [Phi-3]                     | x         | Limit 4K | Limit 4K   |

## Clone OPEA GenAIComps

@@ -121,53 +121,6 @@ export vLLM_ENDPOINT="http://${vLLM_HOST_IP}:8008"
python llm.py
```

#### 1.2.3 Start the Ray Service

Install the requirements for Ray Service

```bash
cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray

pip install -r requirements.txt
```

Execute the docker run command to initiate the backend, along with the Python script that launches the microservice.

```bash
export vLLM_RAY_HOST_IP=$(hostname -I | awk '{print $1}') # This sets IP of the current machine
export LLM_MODEL=${your_hf_llm_model}
export DATA_DIR=$HOME/data # Location to download the model
export HF_TOKEN=${your_hf_api_token}

# Build the image first as opea/vllm:cpu
bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray/dependency/build_docker_vllmray.sh

# Initiate the backend
docker run \
--name="vllm-ray-service" \
--runtime=habana \
-v $DATA_DIR:/data \
-e HABANA_VISIBLE_DEVICES=all \
-e OMPI_MCA_btl_vader_single_copy_mechanism=none \
--cap-add=sys_nice \
--ipc=host \
-p 8006:8000 \
-e HF_TOKEN=$HF_TOKEN \
opea/vllm_ray:habana \
/bin/bash -c " \
ray start --head && \
python vllm_ray_openai.py \
--port_number 8000 \
--model_id_or_path $LLM_MODEL \
--tensor_parallel_size 2 \
--enforce_eager False"

# Start the microservice with an endpoint as the above docker run command
export vLLM_RAY_ENDPOINT="http://${vLLM_RAY_HOST_IP}:8006"

python llm.py
```

## 🚀2. Start Microservice with Docker (Option 2)

To start the microservices with Docker, you first need to build the Docker images for the microservice.
@@ -203,22 +156,6 @@ docker build \
-f comps/llms/text-generation/vllm/langchain/Dockerfile .
```

#### 2.1.3 Ray

```bash
# Build the Ray Serve docker
bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray/dependency/build_docker_vllmray.sh

# Build the microservice docker
cd ${OPEA_GENAICOMPS_ROOT}

docker build \
--build-arg https_proxy=$https_proxy \
--build-arg http_proxy=$http_proxy \
-t opea/llm-vllm-ray:latest \
-f comps/llms/text-generation/vllm/ray/Dockerfile .
```

### 2.2 Start LLM Service with the built image

To start a docker container, you have two options:
@@ -247,15 +184,6 @@ export vLLM_LLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL=${your_hf_llm_model}
```

In order to start Ray serve and LLM services, you need to setup the following environment variables first.

```bash
export HF_TOKEN=${your_hf_api_token}
export RAY_Serve_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL=${your_hf_llm_model}
export CHAT_PROCESSOR="ChatModelLlama"
```

### 2.3 Run Docker with CLI (Option A)

#### 2.3.1 TGI
@@ -311,29 +239,6 @@ docker run \
opea/llm-vllm:latest
```

#### 2.3.3 Ray Serve

Start Ray Serve endpoint.

```bash
bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray/dependency/launch_vllmray.sh
```

Start Ray Serve microservice.

```bash
docker run -d \
--name="llm-ray-server" \
-p 9000:9000 \
--ipc=host \
-e http_proxy=$http_proxy \
-e https_proxy=$https_proxy \
-e RAY_Serve_ENDPOINT=$RAY_Serve_ENDPOINT \
-e HF_TOKEN=$HF_TOKEN \
-e LLM_MODEL=$LLM_MODEL \
opea/llm-ray:latest
```

### 2.4 Run Docker with Docker Compose (Option B)

#### 2.4.1 TGI
@@ -350,13 +255,6 @@ cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/langchain
docker compose -f docker_compose_llm.yaml up -d
```

#### 2.4.3 Ray Serve

```bash
cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/ray
docker compose -f docker_compose_llm.yaml up -d
```

## 🚀3. Consume LLM Service

### 3.1 Check Service Status
@@ -391,22 +289,6 @@ curl http://${your_ip}:8008/v1/completions \
}'
```

#### 3.2.3 Verify the Ray Service

```bash
curl http://${your_ip}:8008/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": ${your_hf_llm_model},
"messages": [
{"role": "assistant", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is Deep Learning?"}
],
"max_tokens": 32,
"stream": true
}'
```

### 3.3 Consume LLM Service

You can set the following model parameters according to your needs, such as `max_tokens` and `streaming`.