From 996a4b226c68a4c561974dabd4deb4a9448d291e Mon Sep 17 00:00:00 2001
From: sasha0552
Date: Sun, 24 Dec 2023 20:51:24 +0000
Subject: [PATCH] Add vllm & general improvements (#1)

Patches:
* Enable prompt cache in llama.cpp server

Projects:
* Add vllm

CI:
* Run CI on any branch
* Run CI on PRs
* Rename workflow

Misc:
* Remove Tesla P40-specific automatic config
* Update README.md
---
 .../{build.yml => build-iso-cuda.yml}         |  8 ++--
 .gitmodules                                   |  4 ++
 README.md                                     |  3 +-
 airootfs/home/tori/llama.cpp                  |  2 +-
 airootfs/home/tori/vllm                       |  1 +
 .../files/automatic-config.json               | 10 ----
 .../0100-llamacpp-enable-prompt-cache.patch   | 29 ++++++++++++
 .../patches/0100-vllm-build-for-pascal.patch  | 23 ++++++++++
 .../scripts/0100-automatic-patches.sh         |  3 --
 .../scripts/0100-llamacpp-patches.sh          |  8 ++++
 .../scripts/0100-vllm-patches.sh              |  8 ++++
 .../scripts/1000-automatic-dependencies.sh    | 12 ++---
 .../1000-sillytavern-extras-dependencies.sh   | 16 +++----
 ...000-text-generation-webui-dependencies.sh  | 12 ++---
 .../scripts/1000-vllm-dependencies.sh         | 46 +++++++++++++++++++
 15 files changed, 146 insertions(+), 39 deletions(-)
 rename .github/workflows/{build.yml => build-iso-cuda.yml} (88%)
 create mode 160000 airootfs/home/tori/vllm
 delete mode 100644 airootfs/root/customize_airootfs/files/automatic-config.json
 create mode 100644 airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch
 create mode 100644 airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch
 create mode 100644 airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh
 create mode 100644 airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh
 create mode 100644 airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh

diff --git a/.github/workflows/build.yml b/.github/workflows/build-iso-cuda.yml
similarity index 88%
rename from .github/workflows/build.yml
rename to .github/workflows/build-iso-cuda.yml
index 263a922..406aca3 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build-iso-cuda.yml
@@ -1,8 +1,8 @@
-name: Build ISO
+name: Build ISO (CUDA)
 
 on:
-  push:
-    branches: main
+  - push
+  - pull_request
 
 jobs:
   build:
@@ -35,7 +35,7 @@ jobs:
         pacman --sync --noconfirm --needed archiso
 
         # Build image
-        mkarchiso -v -w /workspace/work -o /workspace/out /workspace
+        mkarchiso -v -m iso -w /workspace/work -o /workspace/out /workspace
 
     - name: Upload artifacts
       uses: actions/upload-artifact@v4
diff --git a/.gitmodules b/.gitmodules
index 081d29f..d3c63f6 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -17,3 +17,7 @@
 [submodule "airootfs/home/tori/text-generation-webui"]
     path = airootfs/home/tori/text-generation-webui
     url = https://github.com/oobabooga/text-generation-webui.git
+
+[submodule "airootfs/home/tori/vllm"]
+    path = airootfs/home/tori/vllm
+    url = https://github.com/vllm-project/vllm.git
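Since vllm is pulled in as a git submodule, a fresh checkout needs the submodule initialized before any of the customize_airootfs scripts can patch or build it. A minimal sketch using standard git commands (the path comes from the .gitmodules entry above):

    git clone --recurse-submodules https://github.com/sasha0552/ToriLinux.git
    # or, in an existing checkout:
    git submodule update --init airootfs/home/tori/vllm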
diff --git a/README.md b/README.md
index f8a3bee..864347b 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ LiveCD distribution based on [ArchLinux](https://archlinux.org/) and currently i
 * [llama.cpp](https://github.com/ggerganov/llama.cpp)
 * [SillyTavern-Extras](https://github.com/SillyTavern/SillyTavern-Extras)
 * [text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+* [vllm](https://github.com/vllm-project/vllm)
 
 If you would like to see another AI-related project included in ToriLinux, please open an [issue](https://github.com/sasha0552/ToriLinux/issues/new).
@@ -19,7 +20,7 @@ If you would like to see another AI-related project included in ToriLinux, pleas
 To use ToriLinux:
 
 1. Install [Ventoy](https://ventoy.net/en/doc_start.html) on a USB drive.
-2. Download the latest ISO from [workflows](https://github.com/sasha0552/ToriLinux/actions) and copy it to the USB drive.
+2. Download the latest ISO from [workflows](https://github.com/sasha0552/ToriLinux/actions?query=branch%3Amain) and copy it to the USB drive.
 3. Boot from the USB drive (select it as the boot device in BIOS/UEFI).
 4. Log in with the username `tori` and password `tori`. You can also use [SSH](https://en.wikipedia.org/wiki/Secure_Shell).
diff --git a/airootfs/home/tori/llama.cpp b/airootfs/home/tori/llama.cpp
index 7082d24..708e179 160000
--- a/airootfs/home/tori/llama.cpp
+++ b/airootfs/home/tori/llama.cpp
@@ -1 +1 @@
-Subproject commit 7082d24cec35e9ce9147535a2224dfc67ee0a78c
+Subproject commit 708e179e8562c2604240df95a2241dea17fd808b
diff --git a/airootfs/home/tori/vllm b/airootfs/home/tori/vllm
new file mode 160000
index 0000000..1db83e3
--- /dev/null
+++ b/airootfs/home/tori/vllm
@@ -0,0 +1 @@
+Subproject commit 1db83e31a2468cae37f326a642c0a4c4edbb5e4f
diff --git a/airootfs/root/customize_airootfs/files/automatic-config.json b/airootfs/root/customize_airootfs/files/automatic-config.json
deleted file mode 100644
index 787ad06..0000000
--- a/airootfs/root/customize_airootfs/files/automatic-config.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "precision": "Full",
-  "cuda_dtype": "FP32",
-  "no_half": true,
-  "no_half_vae": true,
-
-  "cross_attention_options": [
-    "SDP disable memory attention"
-  ]
-}
diff --git a/airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch b/airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch
new file mode 100644
index 0000000..e6bca3c
--- /dev/null
+++ b/airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch
@@ -0,0 +1,29 @@
+--- a/examples/server/server.cpp
++++ b/examples/server/server.cpp
+@@ -191,7 +191,7 @@ enum slot_command
+ struct slot_params
+ {
+     bool stream = true;
+-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
++    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+ 
+     uint32_t seed = -1; // RNG seed
+     int32_t n_keep = 0; // number of tokens to keep from initial prompt
+@@ -712,7 +712,7 @@ struct llama_server_context
+         }
+ 
+         slot->params.stream = json_value(data, "stream", false);
+-        slot->params.cache_prompt = json_value(data, "cache_prompt", false);
++        slot->params.cache_prompt = json_value(data, "cache_prompt", true);
+         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+         slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
+         slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
+@@ -2439,7 +2439,7 @@ json oaicompat_completion_params_parse(
+     // Map OpenAI parameters to llama.cpp parameters
+     llama_params["model"] = json_value(body, "model", std::string("uknown"));
+     llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+-    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
++    llama_params["cache_prompt"] = json_value(body, "cache_prompt", true);
+     llama_params["temperature"] = json_value(body, "temperature", 0.8);
+     llama_params["top_k"] = json_value(body, "top_k", 40);
+     llama_params["top_p"] = json_value(body, "top_p", 0.95);
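With the prompt cache now enabled by default, clients that need the old behavior must opt out per request rather than opting in. A minimal sketch against the server's /completion endpoint, whose cache_prompt field is read by the json_value calls patched above (assumes the default 127.0.0.1:8080 listen address; prompt and n_predict values are placeholders):

    curl http://127.0.0.1:8080/completion \
      -H 'Content-Type: application/json' \
      -d '{"prompt": "Hello, world", "n_predict": 16, "cache_prompt": false}'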
diff --git a/airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch b/airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch
new file mode 100644
index 0000000..5b300d1
--- /dev/null
+++ b/airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch
@@ -0,0 +1,23 @@
+--- a/setup.py
++++ b/setup.py
+@@ -15,7 +15,7 @@ ROOT_DIR = os.path.dirname(__file__)
+ MAIN_CUDA_VERSION = "12.1"
+ 
+ # Supported NVIDIA GPU architectures.
+-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
++NVIDIA_SUPPORTED_ARCHS = {"6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+ ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
+ # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
+ 
+@@ -146,9 +146,9 @@ if _is_cuda() and not compute_capabilities:
+     device_count = torch.cuda.device_count()
+     for i in range(device_count):
+         major, minor = torch.cuda.get_device_capability(i)
+-        if major < 7:
++        if major < 6:
+             raise RuntimeError(
+-                "GPUs with compute capability below 7.0 are not supported.")
++                "GPUs with compute capability below 6.0 are not supported.")
+         compute_capabilities.add(f"{major}.{minor}")
+ 
+ if _is_cuda():
diff --git a/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh b/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh
index 23c6c08..f7f11cc 100644
--- a/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh
+++ b/airootfs/root/customize_airootfs/scripts/0100-automatic-patches.sh
@@ -16,7 +16,4 @@ pushd "automatic"
 
   # drop pstate in idle
   patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0000-automatic-drop-pstate-in-idle.patch"
-
-  # copy config
-  cp "$CUSTOMIZE_AIROOTFS/files/automatic-config.json" "config.json"
 popd
diff --git a/airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh b/airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh
new file mode 100644
index 0000000..72d6c8f
--- /dev/null
+++ b/airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -eu
+
+# llama.cpp patches
+pushd "llama.cpp"
+  # enable prompt cache by default
+  patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-llamacpp-enable-prompt-cache.patch"
+popd
diff --git a/airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh b/airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh
new file mode 100644
index 0000000..c1b18dc
--- /dev/null
+++ b/airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -eu
+
+# vllm patches
+pushd "vllm"
+  # build for pascal
+  patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-vllm-build-for-pascal.patch"
+popd
diff --git a/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh
index e857bfa..b68f97d 100644
--- a/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh
+++ b/airootfs/root/customize_airootfs/scripts/1000-automatic-dependencies.sh
@@ -3,15 +3,15 @@ set -eu
 
 # automatic dependencies
 pushd "automatic"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
   # create venv
   python3 -m venv venv
 
   # activate venv
   source venv/bin/activate
-
-  # disable package caching
-  export PIP_NO_CACHE_DIR=0
-
-  # install dependencies
-  python3 launch.py --test
+  # install dependencies
+  python3 launch.py --test
+  deactivate
 popd
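The build-for-pascal patch above only relaxes vllm's build-time capability check; it is still worth confirming that the installed GPUs actually report compute capability 6.x. A quick sketch using the same torch calls the patched setup.py relies on (assumes a CUDA-enabled torch is installed):

    python3 - <<'EOF'
    import torch
    # mirror the capability probe from vllm's setup.py
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        print(f"GPU {i}: compute capability {major}.{minor}")
    EOF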
diff --git a/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh
index cf9d401..0613315 100644
--- a/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh
+++ b/airootfs/root/customize_airootfs/scripts/1000-sillytavern-extras-dependencies.sh
@@ -3,17 +3,17 @@ set -eu
 
 # SillyTavern-Extras dependencies
 pushd "SillyTavern-Extras"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
   # create venv
   python3 -m venv venv
 
   # activate venv
   source venv/bin/activate
-
-  # disable package caching
-  export PIP_NO_CACHE_DIR=0
-
-  # install dependencies
-  pip3 install -r requirements.txt
-  pip3 install -r requirements-coqui.txt
-  pip3 install -r requirements-rvc.txt
+  # install dependencies
+  pip3 install -r requirements.txt
+  pip3 install -r requirements-coqui.txt
+  pip3 install -r requirements-rvc.txt
+  deactivate
 popd
diff --git a/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh
index 19667d9..ab5001b 100644
--- a/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh
+++ b/airootfs/root/customize_airootfs/scripts/1000-text-generation-webui-dependencies.sh
@@ -3,15 +3,15 @@ set -eu
 
 # text-generation-webui dependencies
 pushd "text-generation-webui"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
   # create venv
   python3 -m venv venv
 
   # activate venv
   source venv/bin/activate
-
-  # disable package caching
-  export PIP_NO_CACHE_DIR=0
-
-  # install dependencies
-  pip3 install -r requirements.txt
+  # install dependencies
+  pip3 install -r requirements.txt
+  deactivate
 popd
diff --git a/airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh b/airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh
new file mode 100644
index 0000000..6dce971
--- /dev/null
+++ b/airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -eu
+
+# vllm dependencies
+pushd "vllm"
+  # disable package caching
+  export PIP_NO_CACHE_DIR=0
+
+  # limit the number of parallel jobs to avoid OOM
+  export MAX_JOBS=1
+
+  # define supported architectures
+  export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0"
+
+  # cuda home directory
+  export CUDA_HOME=/opt/cuda
+
+  # use gcc 12
+  export CC=gcc-12
+  export CXX=g++-12
+
+  # create venv
+  python3 -m venv venv
+
+  # activate venv
+  source venv/bin/activate
+  # install dependencies
+  pip3 install -r requirements.txt
+  pip3 install -r requirements-build.txt
+
+  # build native extension
+  python3 setup.py build_ext --inplace
+  deactivate
+
+  # remove venv
+  rm -fr venv
+
+  # create venv
+  python3 -m venv venv
+
+  # activate venv
+  source venv/bin/activate
+  # install dependencies
+  pip3 install -r requirements.txt
+  deactivate
+popd
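After customize_airootfs runs, the venv left in ~/vllm contains only the runtime requirements, with the native extension already built in-place in the source tree by the first venv. A minimal sketch of serving a model from it (the model name is a placeholder, and the OpenAI-compatible entrypoint is assumed from the vllm revision pinned above):

    cd ~/vllm
    source venv/bin/activate
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m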