From 996a4b226c68a4c561974dabd4deb4a9448d291e Mon Sep 17 00:00:00 2001
From: sasha0552
Date: Sun, 24 Dec 2023 20:51:24 +0000
Subject: [PATCH] Add vllm & general improvements (#1)

Patches:
* Enable prompt cache in llama.cpp server

Projects:
* Add vllm

CI:
* Run CI on any branch
* Run CI on PRs
* Rename workflow

Misc:
* Remove Tesla P40-specific automatic config
* Update README.md
---
 .../{build.yml => build-iso-cuda.yml}         |  8 ++--
 .gitmodules                                   |  4 ++
 README.md                                     |  3 +-
 airootfs/home/tori/llama.cpp                  |  2 +-
 airootfs/home/tori/vllm                       |  1 +
 .../files/automatic-config.json               | 10 ----
 .../0100-llamacpp-enable-prompt-cache.patch   | 29 ++++++++++++
 .../patches/0100-vllm-build-for-pascal.patch  | 23 ++++++++++
 .../scripts/0100-automatic-patches.sh         |  3 --
 .../scripts/0100-llamacpp-patches.sh          |  8 ++++
 .../scripts/0100-vllm-patches.sh              |  8 ++++
 .../scripts/1000-automatic-dependencies.sh    | 12 ++---
 .../1000-sillytavern-extras-dependencies.sh   | 16 +++----
 ...000-text-generation-webui-dependencies.sh  | 12 ++---
 .../scripts/1000-vllm-dependencies.sh         | 46 +++++++++++++++++++
 15 files changed, 146 insertions(+), 39 deletions(-)
 rename .github/workflows/{build.yml => build-iso-cuda.yml} (88%)
 create mode 160000 airootfs/home/tori/vllm
 delete mode 100644 airootfs/root/customize_airootfs/files/automatic-config.json
 create mode 100644 airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch
 create mode 100644 airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch
 create mode 100644 airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh
 create mode 100644 airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh
 create mode 100644 airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh

diff --git a/.github/workflows/build.yml b/.github/workflows/build-iso-cuda.yml
similarity index 88%
rename from .github/workflows/build.yml
rename to .github/workflows/build-iso-cuda.yml
index 263a922..406aca3 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build-iso-cuda.yml
@@ -1,8 +1,8 @@
-name: Build ISO
+name: Build ISO (CUDA)
 
 on:
-  push:
-    branches: main
+  - push
+  - pull_request
 
 jobs:
   build:
@@ -35,7 +35,7 @@ jobs:
         pacman --sync --noconfirm --needed archiso
 
         # Build image
-        mkarchiso -v -w /workspace/work -o /workspace/out /workspace
+        mkarchiso -v -m iso -w /workspace/work -o /workspace/out /workspace
 
     - name: Upload artifacts
       uses: actions/upload-artifact@v4
diff --git a/.gitmodules b/.gitmodules
index 081d29f..d3c63f6 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -17,3 +17,7 @@
 [submodule "airootfs/home/tori/text-generation-webui"]
     path = airootfs/home/tori/text-generation-webui
     url = https://github.com/oobabooga/text-generation-webui.git
+
+[submodule "airootfs/home/tori/vllm"]
+    path = airootfs/home/tori/vllm
+    url = https://github.com/vllm-project/vllm.git
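Since vllm is pulled in as a git submodule, a fresh checkout needs the submodule initialized before any of the customize_airootfs scripts can patch or build it. A minimal sketch using standard git commands (the path comes from the .gitmodules entry above):

    git clone --recurse-submodules https://github.com/sasha0552/ToriLinux.git
    # or, in an existing checkout:
    git submodule update --init airootfs/home/tori/vllm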
diff --git a/README.md b/README.md
index f8a3bee..864347b 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ LiveCD distribution based on [ArchLinux](https://archlinux.org/) and currently i
 * [llama.cpp](https://github.com/ggerganov/llama.cpp)
 * [SillyTavern-Extras](https://github.com/SillyTavern/SillyTavern-Extras)
 * [text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+* [vllm](https://github.com/vllm-project/vllm)
 
 If you would like to see another AI-related project included in ToriLinux, please open an [issue](https://github.com/sasha0552/ToriLinux/issues/new).
@@ -19,7 +20,7 @@ If you would like to see another AI-related project included in ToriLinux, pleas
 To use ToriLinux:
 
 1. Install [Ventoy](https://ventoy.net/en/doc_start.html) on a USB drive.
-2. Download the latest ISO from [workflows](https://github.com/sasha0552/ToriLinux/actions) and copy it to the USB drive.
+2. Download the latest ISO from [workflows](https://github.com/sasha0552/ToriLinux/actions?query=branch%3Amain) and copy it to the USB drive.
 3. Boot from the USB drive (select it as the boot device in BIOS/UEFI).
 4. Log in with the username `tori` and password `tori`. You can also use [SSH](https://en.wikipedia.org/wiki/Secure_Shell).
diff --git a/airootfs/home/tori/llama.cpp b/airootfs/home/tori/llama.cpp
index 7082d24..708e179 160000
--- a/airootfs/home/tori/llama.cpp
+++ b/airootfs/home/tori/llama.cpp
@@ -1 +1 @@
-Subproject commit 7082d24cec35e9ce9147535a2224dfc67ee0a78c
+Subproject commit 708e179e8562c2604240df95a2241dea17fd808b
diff --git a/airootfs/home/tori/vllm b/airootfs/home/tori/vllm
new file mode 160000
index 0000000..1db83e3
--- /dev/null
+++ b/airootfs/home/tori/vllm
@@ -0,0 +1 @@
+Subproject commit 1db83e31a2468cae37f326a642c0a4c4edbb5e4f
diff --git a/airootfs/root/customize_airootfs/files/automatic-config.json b/airootfs/root/customize_airootfs/files/automatic-config.json
deleted file mode 100644
index 787ad06..0000000
--- a/airootfs/root/customize_airootfs/files/automatic-config.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "precision": "Full",
-  "cuda_dtype": "FP32",
-  "no_half": true,
-  "no_half_vae": true,
-
-  "cross_attention_options": [
-    "SDP disable memory attention"
-  ]
-}
diff --git a/airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch b/airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch
new file mode 100644
index 0000000..e6bca3c
--- /dev/null
+++ b/airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch
@@ -0,0 +1,29 @@
+--- a/examples/server/server.cpp
++++ b/examples/server/server.cpp
+@@ -191,7 +191,7 @@ enum slot_command
+ struct slot_params
+ {
+     bool stream = true;
+-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
++    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+ 
+     uint32_t seed = -1; // RNG seed
+     int32_t n_keep = 0; // number of tokens to keep from initial prompt
+@@ -712,7 +712,7 @@ struct llama_server_context
+         }
+ 
+         slot->params.stream = json_value(data, "stream", false);
+-        slot->params.cache_prompt = json_value(data, "cache_prompt", false);
++        slot->params.cache_prompt = json_value(data, "cache_prompt", true);
+         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+         slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
+         slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
+@@ -2439,7 +2439,7 @@ json oaicompat_completion_params_parse(
+     // Map OpenAI parameters to llama.cpp parameters
+     llama_params["model"] = json_value(body, "model", std::string("uknown"));
+     llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+-    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
++    llama_params["cache_prompt"] = json_value(body, "cache_prompt", true);
+     llama_params["temperature"] = json_value(body, "temperature", 0.8);
+     llama_params["top_k"] = json_value(body, "top_k", 40);
+     llama_params["top_p"] = json_value(body, "top_p", 0.95);
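With the prompt cache now enabled by default, clients that need the old behavior must opt out per request rather than opting in. A minimal sketch against the server's /completion endpoint, whose cache_prompt field is read by the json_value calls patched above (assumes the default 127.0.0.1:8080 listen address; prompt and n_predict values are placeholders):

    curl http://127.0.0.1:8080/completion \
      -H 'Content-Type: application/json' \
      -d '{"prompt": "Hello, world", "n_predict": 16, "cache_prompt": false}'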
diff --git a/airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch b/airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch
new file mode 100644
index 0000000..5b300d1
--- /dev/null
+++ b/airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch
@@ -0,0 +1,23 @@
+--- a/setup.py
++++ b/setup.py
+@@ -15,7 +15,7 @@ ROOT_DIR = os.path.dirname(__file__)
+ MAIN_CUDA_VERSION = "12.1"
+ 
+ # Supported NVIDIA GPU architectures.
+-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
++NVIDIA_SUPPORTED_ARCHS = {"6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+ ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
+ # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
+ 
+@@ -146,9 +146,9 @@ if _is_cuda() and not compute_capabilities:
+     device_count = torch.cuda.device_count()
+     for i in range(device_count):
+         major, minor = torch.cuda.get_device_capability(i)
+-        if major < 7:
++        if major < 6:
+             raise RuntimeError(
+-                "GPUs with compute capability below 7.0 are not supported.")
++                "GPUs with compute capability below 6.0 are not supported.")
+         compute_capabilities.add(f"{major}.{minor}")
+ 
+ if _is_cuda():
diff --git a/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh b/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh
index 23c6c08..f7f11cc 100644
--- a/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh
+++ b/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh
@@ -16,7 +16,4 @@ pushd "automatic"
 
   # drop pstate in idle
   patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0000-automatic-drop-pstate-in-idle.patch"
-
-  # copy config
-  cp "$CUSTOMIZE_AIROOTFS/files/automatic-config.json" "config.json"
 popd
diff --git a/airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh b/airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh
new file mode 100644
index 0000000..72d6c8f
--- /dev/null
+++ b/airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -eu
+
+# llama.cpp patches
+pushd "llama.cpp"
+  # enable prompt cache by default
+  patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-llamacpp-enable-prompt-cache.patch"
+popd
diff --git a/airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh b/airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh
new file mode 100644
index 0000000..c1b18dc
--- /dev/null
+++ b/airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -eu
+
+# vllm patches
+pushd "vllm"
+  # build for pascal
+  patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-vllm-build-for-pascal.patch"
+popd
diff --git a/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh
index e857bfa..b68f97d 100644
--- a/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh
+++ b/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh
@@ -3,15 +3,15 @@ set -eu
 
 # automatic dependencies
 pushd "automatic"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
   # create venv
   python3 -m venv venv
 
   # activate venv
   source venv/bin/activate
-
-  # disable package caching
-  export PIP_NO_CACHE_DIR=0
-
-  # install dependencies
-  python3 launch.py --test
+  # install dependencies
+  python3 launch.py --test
+  deactivate
 popd
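The build-for-pascal patch above only relaxes vllm's build-time capability check; it is still worth confirming that the installed GPUs actually report compute capability 6.x. A quick sketch using the same torch calls the patched setup.py relies on (assumes a CUDA-enabled torch is installed):

    python3 - <<'EOF'
    import torch
    # mirror the capability probe from vllm's setup.py
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        print(f"GPU {i}: compute capability {major}.{minor}")
    EOF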
diff --git a/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh
index cf9d401..0613315 100644
--- a/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh
+++ b/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh
@@ -3,17 +3,17 @@ set -eu
 
 # SillyTavern-Extras dependencies
 pushd "SillyTavern-Extras"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
   # create venv
   python3 -m venv venv
 
   # activate venv
   source venv/bin/activate
-
-  # disable package caching
-  export PIP_NO_CACHE_DIR=0
-
-  # install dependencies
-  pip3 install -r requirements.txt
-  pip3 install -r requirements-coqui.txt
-  pip3 install -r requirements-rvc.txt
+  # install dependencies
+  pip3 install -r requirements.txt
+  pip3 install -r requirements-coqui.txt
+  pip3 install -r requirements-rvc.txt
+  deactivate
 popd
diff --git a/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh
index 19667d9..ab5001b 100644
--- a/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh
+++ b/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh
@@ -3,15 +3,15 @@ set -eu
 
 # text-generation-webui dependencies
 pushd "text-generation-webui"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
   # create venv
   python3 -m venv venv
 
   # activate venv
   source venv/bin/activate
-
-  # disable package caching
-  export PIP_NO_CACHE_DIR=0
-
-  # install dependencies
-  pip3 install -r requirements.txt
+  # install dependencies
+  pip3 install -r requirements.txt
+  deactivate
 popd
diff --git a/airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh
new file mode 100644
index 0000000..6dce971
--- /dev/null
+++ b/airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -eu
+
+# vllm dependencies
+pushd "vllm"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
+  # limit the number of parallel jobs to avoid OOM
+  export MAX_JOBS=1
+
+  # define supported architectures
+  export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0"
+
+  # cuda home directory
+  export CUDA_HOME=/opt/cuda
+
+  # use gcc 12
+  export CC=gcc-12
+  export CXX=g++-12
+
+  # create venv
+  python3 -m venv venv
+
+  # activate venv
+  source venv/bin/activate
+  # install dependencies
+  pip3 install -r requirements.txt
+  pip3 install -r requirements-build.txt
+
+  # build native extension
+  python3 setup.py build_ext --inplace
+  deactivate
+
+  # remove venv
+  rm -fr venv
+
+  # create venv
+  python3 -m venv venv
+
+  # activate venv
+  source venv/bin/activate
+  # install dependencies
+  pip3 install -r requirements.txt
+  deactivate
+popd
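After customize_airootfs runs, the venv left in ~/vllm contains only the runtime requirements, with the native extension already built in-place in the source tree by the first venv. A minimal sketch of serving a model from it (the model name is a placeholder, and the OpenAI-compatible entrypoint is assumed from the vllm revision pinned above):

    cd ~/vllm
    source venv/bin/activate
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m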