scaleapi · yunfeng-scale · May 31, 2024 · May 31, 2024 · edgan8 · May 31, 2024
diff --git a/model-engine/model_engine_server/api/app.py b/model-engine/model_engine_server/api/app.py
@@ -33,7 +33,7 @@
 
 logger = make_logger(logger_name())
 
-# Allows us to make the Uvicorn worker concurrency in model_engine_server/api/worker.py very high
+# See also the Uvicorn concurrency limit (CONCURRENCY_LIMIT) in model_engine_server/entrypoints/start_fastapi_server.py
 MAX_CONCURRENCY = 500
 
 concurrency_limiter = MultiprocessingConcurrencyLimiter(

diff --git a/model-engine/model_engine_server/api/worker.py b/model-engine/model_engine_server/api/worker.py
diff --git a/model-engine/model_engine_server/entrypoints/start_fastapi_server.py b/model-engine/model_engine_server/entrypoints/start_fastapi_server.py
@@ -3,28 +3,42 @@
 
 You can do this with `start-fastapi-server`.
 """
+
 import argparse
 import subprocess
 from typing import List
 
+# Uvicorn returns 503 (not 429) when concurrency exceeds its --limit-concurrency value.
+# Autoscaling targets a much lower concurrency (around 50); this limit only keeps bursty traffic from triggering 503s.
+# It is set very high because model_engine_server/api/app.py enforces a lower per-pod concurrency at which we start returning 429s.
+CONCURRENCY_LIMIT = 10000
+
 
-def start_gunicorn_server(port: int, num_workers: int, debug: bool) -> None:
-    """Starts a GUnicorn server locally."""
+def start_uvicorn_server(port: int, debug: bool) -> None:
+    """Starts a Uvicorn server locally."""
     additional_args: List[str] = []
     if debug:
-        additional_args.extend(["--reload", "--timeout", "0"])
+        additional_args.extend(["--reload", "--timeout-graceful-shutdown", "0"])
     command = [
-        "gunicorn",
-        "--bind",
-        f"[::]:{port}",
-        "--timeout",
+        "uvicorn",
+        "--host",
+        "::",
+        "--port",
+        f"{port}",
+        "--timeout-graceful-shutdown",
         "60",
-        "--keep-alive",
+        "--timeout-keep-alive",
         "2",
-        "--worker-class",
-        "model_engine_server.api.worker.LaunchWorker",
+        # uvloop and httptools are both faster than their alternatives, but they are not compatible
+        # with Windows or PyPy.
+        "--loop",
+        "uvloop",
+        "--http",
+        "httptools",
+        "--limit-concurrency",
+        f"{CONCURRENCY_LIMIT}",
         "--workers",
-        f"{num_workers}",
+        "1",  # Let the Kubernetes deployment handle the number of pods
         *additional_args,
         "model_engine_server.api.app:app",
     ]
@@ -38,11 +52,10 @@ def entrypoint():
     # We can probably use asyncio since this service is going to be more I/O bound.
     parser = argparse.ArgumentParser(description="Hosted Inference Server")
     parser.add_argument("--port", type=int, default=5000)
-    parser.add_argument("--num-workers", type=int, default=4)
     parser.add_argument("--debug", "-d", action="store_true")
     args = parser.parse_args()
 
-    start_gunicorn_server(args.port, args.num_workers, args.debug)
+    start_uvicorn_server(args.port, args.debug)
 
 
 if __name__ == "__main__":