Model Server (#695)
Provides the ability to pull the NLP models out into a separate model server, which can then be hosted on a GPU instance if desired.
yuhongsun96 authored Nov 7, 2023
1 parent fe938b6 commit 7433ddd
Showing 20 changed files with 614 additions and 85 deletions.
@@ -0,0 +1,36 @@
name: Build and Push Backend Images on Tagging

on:
push:
tags:
- '*'

jobs:
build-and-push:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1

- name: Login to Docker Hub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}

- name: Backend Image Docker Build and Push
uses: docker/build-push-action@v2
with:
context: ./backend
file: ./backend/Dockerfile.model_server
platforms: linux/amd64,linux/arm64
push: true
tags: |
danswer/danswer-model-server:${{ github.ref_name }}
danswer/danswer-model-server:latest
build-args: |
DANSWER_VERSION: ${{ github.ref_name }}
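
With this workflow in place, pushing any Git tag triggers a multi-arch (linux/amd64 and linux/arm64) build of Dockerfile.model_server; the tag name is reused both as the Docker Hub image tag and as the DANSWER_VERSION build argument.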
1 change: 1 addition & 0 deletions backend/Dockerfile
@@ -45,6 +45,7 @@ RUN apt-get remove -y linux-libc-dev && \
# Set up application files
WORKDIR /app
COPY ./danswer /app/danswer
COPY ./shared_models /app/shared_models
COPY ./alembic /app/alembic
COPY ./alembic.ini /app/alembic.ini
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
27 changes: 27 additions & 0 deletions backend/Dockerfile.model_server
@@ -0,0 +1,27 @@
FROM python:3.11.4-slim-bookworm

# Default DANSWER_VERSION, typically overridden during builds by GitHub Actions.
ARG DANSWER_VERSION=0.2-dev
ENV DANSWER_VERSION=${DANSWER_VERSION}

COPY ./requirements/model_server.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt

WORKDIR /app
# Needed for model configs and defaults
COPY ./danswer/configs /app/danswer/configs
# Utils used by model server
COPY ./danswer/utils/logger.py /app/danswer/utils/logger.py
COPY ./danswer/utils/timing.py /app/danswer/utils/timing.py
# Version information
COPY ./danswer/__init__.py /app/danswer/__init__.py
# Shared implementations for running NLP models locally
COPY ./danswer/search/search_nlp_models.py /app/danswer/search/search_nlp_models.py
# Request/Response models
COPY ./shared_models /app/shared_models
# Model Server main code
COPY ./model_server /app/model_server

ENV PYTHONPATH /app

CMD ["uvicorn", "model_server.main:app", "--host", "0.0.0.0", "--port", "9000"]
6 changes: 4 additions & 2 deletions backend/danswer/background/update.py
@@ -13,6 +13,7 @@
from danswer.background.indexing.job_client import SimpleJobClient
from danswer.background.indexing.run_indexing import run_indexing_entrypoint
from danswer.configs.app_configs import EXPERIMENTAL_SIMPLE_JOB_CLIENT_ENABLED
from danswer.configs.app_configs import MODEL_SERVER_HOST
from danswer.configs.app_configs import NUM_INDEXING_WORKERS
from danswer.configs.model_configs import MIN_THREADS_ML_MODELS
from danswer.db.connector import fetch_connectors
@@ -290,7 +291,8 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> None:


if __name__ == "__main__":
logger.info("Warming up Embedding Model(s)")
warm_up_models(indexer_only=True)
if not MODEL_SERVER_HOST:
logger.info("Warming up Embedding Model(s)")
warm_up_models(indexer_only=True)
logger.info("Starting Indexing Loop")
update_loop()
14 changes: 8 additions & 6 deletions backend/danswer/configs/app_configs.py
@@ -3,6 +3,7 @@
from danswer.configs.constants import AuthType
from danswer.configs.constants import DocumentIndexType


#####
# App Configs
#####
@@ -19,6 +20,7 @@
# Use this if you want to use Danswer as a search engine only without the LLM capabilities
DISABLE_GENERATIVE_AI = os.environ.get("DISABLE_GENERATIVE_AI", "").lower() == "true"


#####
# Web Configs
#####
@@ -56,7 +58,6 @@
if _VALID_EMAIL_DOMAINS_STR
else []
)

# OAuth Login Flow
# Used for both Google OAuth2 and OIDC flows
OAUTH_CLIENT_ID = (
@@ -200,12 +201,13 @@


#####
# Encoder Model Endpoint Configs (Currently unused, running the models in memory)
# Model Server Configs
#####
BI_ENCODER_HOST = "localhost"
BI_ENCODER_PORT = 9000
CROSS_ENCODER_HOST = "localhost"
CROSS_ENCODER_PORT = 9000
# If MODEL_SERVER_HOST is set, the NLP models required for Danswer are offloaded to the server via
# requests. Be sure to include the scheme in the MODEL_SERVER_HOST value.
MODEL_SERVER_HOST = os.environ.get("MODEL_SERVER_HOST") or None
MODEL_SERVER_ALLOWED_HOST = os.environ.get("MODEL_SERVER_ALLOWED_HOST") or "0.0.0.0"
MODEL_SERVER_PORT = int(os.environ.get("MODEL_SERVER_PORT") or "9000")


#####
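
A minimal sketch of how a client might combine these settings, assuming MODEL_SERVER_HOST already carries the scheme as the comment above requires (the endpoint path is a hypothetical example, not taken from the commit):

# Illustrative only: deriving the model server endpoint from the new configs.
import requests

from danswer.configs.app_configs import MODEL_SERVER_HOST
from danswer.configs.app_configs import MODEL_SERVER_PORT


def build_model_server_url() -> str | None:
    if not MODEL_SERVER_HOST:
        return None  # no model server configured; run models in-process
    # MODEL_SERVER_HOST includes the scheme (e.g. "http://model-server"),
    # so only the port needs to be appended
    return f"{MODEL_SERVER_HOST}:{MODEL_SERVER_PORT}"


url = build_model_server_url()
if url is not None:
    # Endpoint path is a hypothetical example
    response = requests.post(f"{url}/encoder/bi-encoder-embed", json={"texts": ["hello"]})
    print(response.json()["embeddings"])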
15 changes: 4 additions & 11 deletions backend/danswer/indexing/embedder.py
@@ -1,16 +1,14 @@
import numpy
from sentence_transformers import SentenceTransformer # type: ignore

from danswer.configs.app_configs import ENABLE_MINI_CHUNK
from danswer.configs.model_configs import ASYM_PASSAGE_PREFIX
from danswer.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS
from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
from danswer.indexing.chunker import split_chunk_text_into_mini_chunks
from danswer.indexing.models import ChunkEmbedding
from danswer.indexing.models import DocAwareChunk
from danswer.indexing.models import IndexChunk
from danswer.search.models import Embedder
from danswer.search.search_nlp_models import get_default_embedding_model
from danswer.search.search_nlp_models import EmbeddingModel
from danswer.utils.timing import log_function_time


@@ -24,7 +22,7 @@ def encode_chunks(
) -> list[IndexChunk]:
embedded_chunks: list[IndexChunk] = []
if embedding_model is None:
embedding_model = get_default_embedding_model()
embedding_model = EmbeddingModel()

chunk_texts = []
chunk_mini_chunks_count = {}
@@ -43,15 +41,10 @@
chunk_texts[i : i + batch_size] for i in range(0, len(chunk_texts), batch_size)
]

embeddings_np: list[numpy.ndarray] = []
embeddings: list[list[float]] = []
for text_batch in text_batches:
# Normalize embeddings is only configured via model_configs.py, be sure to use right value for the set loss
embeddings_np.extend(
embedding_model.encode(
text_batch, normalize_embeddings=NORMALIZE_EMBEDDINGS
)
)
embeddings: list[list[float]] = [embedding.tolist() for embedding in embeddings_np]
embeddings.extend(embedding_model.encode(text_batch))

embedding_ind_start = 0
for chunk_ind, chunk in enumerate(chunks):
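
This diff swaps the direct SentenceTransformer calls for an EmbeddingModel abstraction whose encode() already returns list[list[float]]. Its implementation lives in danswer/search/search_nlp_models.py and is not shown here; a minimal sketch of what such a wrapper might look like (method bodies, model name, and endpoint path are assumptions):

# Sketch of an EmbeddingModel wrapper; the real one lives in
# danswer/search/search_nlp_models.py and is not part of this excerpt.
import requests
from sentence_transformers import SentenceTransformer  # type: ignore

from danswer.configs.app_configs import MODEL_SERVER_HOST
from danswer.configs.app_configs import MODEL_SERVER_PORT


class EmbeddingModel:
    def __init__(self, model_name: str = "intfloat/e5-base-v2") -> None:
        # Model name here is an illustrative assumption
        self._remote = f"{MODEL_SERVER_HOST}:{MODEL_SERVER_PORT}" if MODEL_SERVER_HOST else None
        # Only load weights locally when no model server is configured
        self._local = None if self._remote else SentenceTransformer(model_name)

    def encode(self, texts: list[str]) -> list[list[float]]:
        if self._remote is not None:
            # Offload inference to the model server (endpoint path assumed)
            resp = requests.post(f"{self._remote}/encoder/bi-encoder-embed", json={"texts": texts})
            return resp.json()["embeddings"]
        # Local path: convert numpy arrays to plain lists so callers see one type
        return [emb.tolist() for emb in self._local.encode(texts)]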
26 changes: 21 additions & 5 deletions backend/danswer/main.py
@@ -1,4 +1,5 @@
import nltk # type:ignore
import torch
import uvicorn
from fastapi import FastAPI
from fastapi import Request
@@ -7,6 +8,7 @@
from fastapi.responses import JSONResponse
from httpx_oauth.clients.google import GoogleOAuth2

from danswer import __version__
from danswer.auth.schemas import UserCreate
from danswer.auth.schemas import UserRead
from danswer.auth.schemas import UserUpdate
@@ -17,6 +19,8 @@
from danswer.configs.app_configs import APP_PORT
from danswer.configs.app_configs import AUTH_TYPE
from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
from danswer.configs.app_configs import MODEL_SERVER_HOST
from danswer.configs.app_configs import MODEL_SERVER_PORT
from danswer.configs.app_configs import OAUTH_CLIENT_ID
from danswer.configs.app_configs import OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import SECRET
@@ -72,7 +76,7 @@ def value_error_handler(_: Request, exc: ValueError) -> JSONResponse:


def get_application() -> FastAPI:
application = FastAPI(title="Internal Search QA Backend", debug=True, version="0.1")
application = FastAPI(title="Danswer Backend", version=__version__)
application.include_router(backend_router)
application.include_router(chat_router)
application.include_router(event_processing_router)
@@ -176,11 +180,23 @@ def startup_event() -> None:
logger.info(f'Query embedding prefix: "{ASYM_QUERY_PREFIX}"')
logger.info(f'Passage embedding prefix: "{ASYM_PASSAGE_PREFIX}"')

logger.info("Warming up local NLP models.")
warm_up_models()
qa_model = get_default_qa_model()
if MODEL_SERVER_HOST:
logger.info(
f"Using Model Server: http://{MODEL_SERVER_HOST}:{MODEL_SERVER_PORT}"
)
else:
logger.info("Warming up local NLP models.")
if torch.cuda.is_available():
logger.info("GPU is available")
else:
logger.info("GPU is not available")
logger.info(f"Torch Threads: {torch.get_num_threads()}")

warm_up_models()

# This is for the LLM, most LLMs will not need warming up
qa_model.warm_up_model()
# It logs for itself
get_default_qa_model().warm_up_model()

logger.info("Verifying query preprocessing (NLTK) data is downloaded")
nltk.download("stopwords", quiet=True)
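
Net effect of this hunk: when MODEL_SERVER_HOST is set, the API server only logs the remote endpoint and skips loading encoder weights in-process; otherwise it warms the local models as before, now also logging whether a GPU is visible to torch and how many threads torch will use.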
19 changes: 6 additions & 13 deletions backend/danswer/search/danswer_helper.py
@@ -1,12 +1,9 @@
import numpy as np
import tensorflow as tf # type:ignore
from transformers import AutoTokenizer # type:ignore

from danswer.search.models import QueryFlow
from danswer.search.models import SearchType
from danswer.search.search_nlp_models import get_default_intent_model
from danswer.search.search_nlp_models import get_default_intent_model_tokenizer
from danswer.search.search_nlp_models import get_default_tokenizer
from danswer.search.search_nlp_models import IntentModel
from danswer.search.search_runner import remove_stop_words
from danswer.server.models import HelperResponse
from danswer.utils.logger import setup_logger
@@ -28,15 +25,11 @@ def count_unk_tokens(text: str, tokenizer: AutoTokenizer) -> int:

@log_function_time()
def query_intent(query: str) -> tuple[SearchType, QueryFlow]:
tokenizer = get_default_intent_model_tokenizer()
intent_model = get_default_intent_model()
model_input = tokenizer(query, return_tensors="tf", truncation=True, padding=True)

predictions = intent_model(model_input)[0]
probabilities = tf.nn.softmax(predictions, axis=-1)
class_percentages = np.round(probabilities.numpy() * 100, 2)

keyword, semantic, qa = class_percentages.tolist()[0]
intent_model = IntentModel()
class_probs = intent_model.predict(query)
keyword = class_probs[0]
semantic = class_probs[1]
qa = class_probs[2]

# Heavily bias towards QA, from user perspective, answering a statement is not as bad as not answering a question
if qa > 20:
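
query_intent() now delegates tokenization and softmax to an IntentModel whose predict() returns per-class percentages in [keyword, semantic, QA] order. The class lives in search_nlp_models.py and is not shown in this excerpt; a sketch of its local inference path, reconstructed from the lines removed above (the Hugging Face model name is an assumption):

# Sketch of IntentModel's local path, mirroring the tensorflow/softmax logic
# this diff removed from danswer_helper.py; model name is an assumption.
import numpy as np
import tensorflow as tf  # type: ignore
from transformers import AutoTokenizer  # type: ignore
from transformers import TFAutoModelForSequenceClassification  # type: ignore


class IntentModel:
    def __init__(self, model_name: str = "danswer/intent-model") -> None:
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

    def predict(self, query: str) -> list[float]:
        inputs = self._tokenizer(query, return_tensors="tf", truncation=True, padding=True)
        logits = self._model(inputs)[0]
        probabilities = tf.nn.softmax(logits, axis=-1)
        # Percentages in [keyword, semantic, question-answering] order,
        # matching the thresholds used by query_intent()
        return np.round(probabilities.numpy() * 100, 2).tolist()[0]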
[Diff for the remaining changed files not loaded.]

1 comment on commit 7433ddd

vercel bot commented on 7433ddd, Nov 7, 2023