diff --git a/model-engine/model_engine_server/api/app.py b/model-engine/model_engine_server/api/app.py index 851f01833..33e6830a8 100644 --- a/model-engine/model_engine_server/api/app.py +++ b/model-engine/model_engine_server/api/app.py @@ -33,7 +33,7 @@ logger = make_logger(logger_name()) -# Allows us to make the Uvicorn worker concurrency in model_engine_server/api/worker.py very high +# See also the Uvicorn concurrency limit (CONCURRENCY_LIMIT) in model_engine_server/entrypoints/start_fastapi_server.py MAX_CONCURRENCY = 500 concurrency_limiter = MultiprocessingConcurrencyLimiter( diff --git a/model-engine/model_engine_server/api/worker.py b/model-engine/model_engine_server/api/worker.py deleted file mode 100644 index 289640c88..000000000 --- a/model-engine/model_engine_server/api/worker.py +++ /dev/null @@ -1,14 +0,0 @@ -from uvicorn.workers import UvicornWorker - -# Gunicorn returns 503 instead of 429 when concurrency exceeds the limit -# We'll autoscale at target concurrency of a much lower number (around 50), and this just makes sure we don't 503 with bursty traffic -# We set this very high since model_engine_server/api/app.py sets a lower per-pod concurrency at which we start returning 429s -CONCURRENCY_LIMIT = 10000 - - -class LaunchWorker(UvicornWorker): - """Overrides the configuration of the Uvicorn Worker.""" - - # uvloop and httptools are both faster than their alternatives, but they are not compatible - # with Windows or PyPy. - CONFIG_KWARGS = {"loop": "uvloop", "http": "httptools", "limit_concurrency": CONCURRENCY_LIMIT} diff --git a/model-engine/model_engine_server/entrypoints/start_fastapi_server.py b/model-engine/model_engine_server/entrypoints/start_fastapi_server.py index 119935ffb..eed6a88ae 100644 --- a/model-engine/model_engine_server/entrypoints/start_fastapi_server.py +++ b/model-engine/model_engine_server/entrypoints/start_fastapi_server.py @@ -3,28 +3,42 @@ You can do this with `start-fastapi-server`. 
""" + import argparse import subprocess from typing import List +# Uvicorn returns 503 instead of 429 when concurrency exceeds the limit +# We'll autoscale at target concurrency of a much lower number (around 50), and this just makes sure we don't 503 with bursty traffic +# We set this very high since model_engine_server/api/app.py sets a lower per-pod concurrency at which we start returning 429s +CONCURRENCY_LIMIT = 10000 + -def start_gunicorn_server(port: int, num_workers: int, debug: bool) -> None: - """Starts a GUnicorn server locally.""" +def start_uvicorn_server(port: int, debug: bool) -> None: + """Starts a Uvicorn server locally.""" additional_args: List[str] = [] if debug: - additional_args.extend(["--reload", "--timeout", "0"]) + additional_args.extend(["--reload", "--timeout-graceful-shutdown", "0"]) command = [ - "gunicorn", - "--bind", - f"[::]:{port}", - "--timeout", + "uvicorn", + "--host", + "::", + "--port", + f"{port}", + "--timeout-graceful-shutdown", "60", - "--keep-alive", + "--timeout-keep-alive", "2", - "--worker-class", - "model_engine_server.api.worker.LaunchWorker", + # uvloop and httptools are both faster than their alternatives, but they are not compatible + # with Windows or PyPy. + "--loop", + "uvloop", + "--http", + "httptools", + "--limit-concurrency", + f"{CONCURRENCY_LIMIT}", "--workers", - f"{num_workers}", + "1", # Let the Kubernetes deployment handle the number of pods *additional_args, "model_engine_server.api.app:app", ] @@ -38,11 +52,10 @@ def entrypoint(): # We can probably use asyncio since this service is going to be more I/O bound. 
parser = argparse.ArgumentParser(description="Hosted Inference Server") parser.add_argument("--port", type=int, default=5000) - parser.add_argument("--num-workers", type=int, default=4) parser.add_argument("--debug", "-d", action="store_true") args = parser.parse_args() - start_gunicorn_server(args.port, args.num_workers, args.debug) + start_uvicorn_server(args.port, args.debug) if __name__ == "__main__": diff --git a/model-engine/requirements.in b/model-engine/requirements.in index 2ef63150f..0a6e70e75 100644 --- a/model-engine/requirements.in +++ b/model-engine/requirements.in @@ -57,6 +57,6 @@ tokenizers~=0.15.2 tqdm~=4.64 transformers==4.38.0 twine==3.7.1 -uvicorn==0.17.6 +uvicorn==0.30.0 uvloop==0.17.0 yarl~=1.4 \ No newline at end of file diff --git a/model-engine/requirements.txt b/model-engine/requirements.txt index 71e7440d8..546844085 100644 --- a/model-engine/requirements.txt +++ b/model-engine/requirements.txt @@ -23,8 +23,6 @@ anyio==3.7.1 # azure-core # httpx # starlette -asgiref==3.7.2 - # via uvicorn asn1crypto==1.5.1 # via scramp async-timeout==4.0.2 @@ -76,9 +74,7 @@ boto3==1.28.1 # celery # kombu boto3-stubs[essential]==1.26.67 - # via - # -r model-engine/requirements.in - # boto3-stubs + # via -r model-engine/requirements.in botocore==1.31.1 # via # -r model-engine/requirements.in @@ -95,9 +91,7 @@ cachetools==5.3.1 cattrs==23.1.2 # via ddtrace celery[redis,sqs,tblib]==5.3.6 - # via - # -r model-engine/requirements.in - # celery + # via -r model-engine/requirements.in certifi==2023.7.22 # via # datadog-api-client @@ -140,7 +134,6 @@ cryptography==42.0.5 # azure-storage-blob # msal # pyjwt - # secretstorage dataclasses-json==0.5.9 # via -r model-engine/requirements.in datadog==0.47.0 @@ -159,7 +152,7 @@ docutils==0.20.1 # via readme-renderer envier==0.4.0 # via ddtrace -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via # anyio # cattrs @@ -224,7 +217,7 @@ importlib-metadata==6.8.0 # keyring # quart # twine -importlib-resources==6.1.1 
+importlib-resources==6.4.0 # via # alembic # jsonschema @@ -242,10 +235,6 @@ itsdangerous==2.1.2 # starlette jaraco-classes==3.3.0 # via keyring -jeepney==0.8.0 - # via - # keyring - # secretstorage jinja2==3.0.3 # via # -r model-engine/requirements.in @@ -371,9 +360,7 @@ pygments==2.15.1 # readme-renderer # rich pyjwt[crypto]==2.8.0 - # via - # msal - # pyjwt + # via msal pyproject-hooks==1.0.0 # via build python-dateutil==2.8.2 @@ -445,8 +432,6 @@ safetensors==0.4.2 # via transformers scramp==1.4.4 # via pg8000 -secretstorage==3.3.3 - # via keyring sentencepiece==0.1.99 # via -r model-engine/requirements.in sh==1.14.3 @@ -479,7 +464,6 @@ sqlalchemy[asyncio]==2.0.4 # via # -r model-engine/requirements.in # alembic - # sqlalchemy sse-starlette==1.6.1 # via -r model-engine/requirements.in sseclient-py==1.7.2 @@ -489,7 +473,6 @@ starlette[full]==0.36.3 # -r model-engine/requirements.in # fastapi # sse-starlette - # starlette stringcase==1.2.0 # via -r model-engine/requirements.in tblib==2.0.0 @@ -530,7 +513,6 @@ types-s3transfer==0.6.1 typing-extensions==4.10.0 # via # aioredis - # asgiref # azure-core # azure-keyvault-secrets # azure-servicebus @@ -556,6 +538,7 @@ typing-extensions==4.10.0 # sqlalchemy # starlette # typing-inspect + # uvicorn typing-inspect==0.9.0 # via dataclasses-json tzdata==2023.3 @@ -572,7 +555,7 @@ urllib3==1.26.16 # kubernetes # kubernetes-asyncio # requests -uvicorn==0.17.6 +uvicorn==0.30.0 # via -r model-engine/requirements.in uvloop==0.17.0 # via -r model-engine/requirements.in