Add vllm & general improvements (#1)
Patches:
* Enable prompt cache in llama.cpp server

Projects:
* Add vllm

CI:
* Run CI on any branch
* Run CI on PRs
* Rename workflow

Misc:
* Remove Tesla P40-specific automatic config
* Update README.md
Showing 15 changed files with 146 additions and 39 deletions.
Submodule llama.cpp updated: 7 files

+1 −1    .github/workflows/docker.yml
+50 −6   common/sampling.cpp
+3 −0    common/sampling.h
+2 −0    examples/server/README.md
+44 −0   examples/server/server.cpp
+12 −7   ggml-cuda.cu
+11 −5   llama.cpp
airootfs/root/customize_airootfs/files/automatic-config.json (10 changes: 0 additions & 10 deletions)
This file was deleted.
airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch (29 changes: 29 additions & 0 deletions)
@@ -0,0 +1,29 @@
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -191,7 +191,7 @@ enum slot_command
 struct slot_params
 {
     bool stream = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

     uint32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -712,7 +712,7 @@ struct llama_server_context
     }

     slot->params.stream = json_value(data, "stream", false);
-    slot->params.cache_prompt = json_value(data, "cache_prompt", false);
+    slot->params.cache_prompt = json_value(data, "cache_prompt", true);
     slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
     slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
     slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
@@ -2439,7 +2439,7 @@ json oaicompat_completion_params_parse(
     // Map OpenAI parameters to llama.cpp parameters
     llama_params["model"] = json_value(body, "model", std::string("uknown"));
     llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
-    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
+    llama_params["cache_prompt"] = json_value(body, "cache_prompt", true);
     llama_params["temperature"] = json_value(body, "temperature", 0.8);
     llama_params["top_k"] = json_value(body, "top_k", 40);
     llama_params["top_p"] = json_value(body, "top_p", 0.95);
airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch (23 changes: 23 additions & 0 deletions)
@@ -0,0 +1,23 @@
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ ROOT_DIR = os.path.dirname(__file__)
 MAIN_CUDA_VERSION = "12.1"

 # Supported NVIDIA GPU architectures.
-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+NVIDIA_SUPPORTED_ARCHS = {"6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
 ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
 # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)

@@ -146,9 +146,9 @@ if _is_cuda() and not compute_capabilities:
     device_count = torch.cuda.device_count()
     for i in range(device_count):
         major, minor = torch.cuda.get_device_capability(i)
-        if major < 7:
+        if major < 6:
             raise RuntimeError(
-                "GPUs with compute capability below 7.0 are not supported.")
+                "GPUs with compute capability below 6.0 are not supported.")
         compute_capabilities.add(f"{major}.{minor}")

 if _is_cuda():
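The patch widens the accepted set to compute capability 6.x, so Pascal cards (for example a Tesla P40, which is 6.1) pass the check in setup.py. The sketch below shows a quick way to see what the patched check would accept on a given machine, using the same torch.cuda calls the build script uses; it is an illustrative helper, not part of the patch.

```python
# Sketch: report each visible GPU's compute capability and whether the
# patched vllm setup.py threshold (>= 6.0 instead of >= 7.0) would accept it.
import torch

def check_gpus(min_major: int = 6) -> None:
    if not torch.cuda.is_available():
        print("No CUDA device visible")
        return
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        status = "supported" if major >= min_major else "unsupported"
        print(f"GPU {i}: compute capability {major}.{minor} -> {status}")

if __name__ == "__main__":
    check_gpus()
```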
airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh (8 changes: 8 additions & 0 deletions)
@@ -0,0 +1,8 @@
#!/bin/bash
set -eu

# llama.cpp patches
pushd "llama.cpp"
# enable prompt cache by default
patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-llamacpp-enable-prompt-cache.patch"
popd
airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh (8 changes: 8 additions & 0 deletions)
@@ -0,0 +1,8 @@
#!/bin/bash
set -eu

# vllm patches
pushd "vllm"
# build for pascal
patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-vllm-build-for-pascal.patch"
popd
airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh (46 changes: 46 additions & 0 deletions)
@@ -0,0 +1,46 @@
#!/bin/bash
set -eu

# vllm dependencies
pushd "vllm"
# disable package caching
export PIP_NO_CACHE_DIR=0

# limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1

# define supported architectures
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0"

# cuda home directory
export CUDA_HOME=/opt/cuda

# use gcc 12
export CC=gcc-12
export CXX=g++-12

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate
# install dependencies
pip3 install -r requirements.txt
pip3 install -r requirements-build.txt

# build native extension
python3 setup.py build_ext --inplace
deactivate

# remove venv
rm -fr venv

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate
# install dependencies
pip3 install -r requirements.txt
deactivate
popd
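The script builds the native extension in place with one venv, discards that venv, and then recreates a leaner venv with only the runtime requirements. A small sanity check that could be run from inside the final venv in the vllm checkout is sketched below; the module and attributes referenced are assumptions about the vllm and torch packages, not something this commit adds.

```python
# Sketch: confirm torch sees a CUDA device and the vllm package imports
# from the in-place build (run after "source venv/bin/activate" in ./vllm).
import torch
import vllm

print("torch", torch.__version__, "CUDA available:", torch.cuda.is_available())
print("vllm", vllm.__version__, "imported")
```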