Add vllm & general improvements #1

Merged (14 commits, Dec 24, 2023)
@@ -1,8 +1,8 @@
-name: Build ISO
+name: Build ISO (CUDA)

 on:
-  push:
-    branches: main
+  - push
+  - pull_request

 jobs:
   build:
@@ -35,7 +35,7 @@ jobs:
         pacman --sync --noconfirm --needed archiso

         # Build image
-        mkarchiso -v -w /workspace/work -o /workspace/out /workspace
+        mkarchiso -v -m iso -w /workspace/work -o /workspace/out /workspace

     - name: Upload artifacts
       uses: actions/upload-artifact@v4
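For reference, the new `-m iso` flag limits mkarchiso to its ISO build mode (recent archiso releases also offer bootstrap and netboot modes). A minimal sketch of running the same build step locally, assuming an Arch host with archiso installed; the work/output paths are placeholders:

```bash
# Local equivalent of the CI build step above (run from the repository root).
sudo pacman --sync --noconfirm --needed archiso
sudo mkarchiso -v -m iso -w /tmp/tori-work -o /tmp/tori-out .
```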
4 changes: 4 additions & 0 deletions .gitmodules
@@ -17,3 +17,7 @@
 [submodule "airootfs/home/tori/text-generation-webui"]
 	path = airootfs/home/tori/text-generation-webui
 	url = https://github.com/oobabooga/text-generation-webui.git
+
+[submodule "airootfs/home/tori/vllm"]
+	path = airootfs/home/tori/vllm
+	url = https://github.com/vllm-project/vllm.git
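Because the ISO build pulls these projects in as git submodules, a fresh checkout needs them initialized; a short sketch using standard git commands and the repository URL from the README:

```bash
# Clone with every submodule, including the newly added vllm checkout...
git clone --recurse-submodules https://github.com/sasha0552/ToriLinux.git
# ...or fetch just the new submodule inside an existing clone.
git submodule update --init airootfs/home/tori/vllm
```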
3 changes: 2 additions & 1 deletion README.md
@@ -6,6 +6,7 @@ LiveCD distribution based on [ArchLinux](https://archlinux.org/) and currently i
 * [llama.cpp](https://github.com/ggerganov/llama.cpp)
 * [SillyTavern-Extras](https://github.com/SillyTavern/SillyTavern-Extras)
 * [text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+* [vllm](https://github.com/vllm-project/vllm)

 If you would like to see another AI-related project included in ToriLinux, please open an [issue](https://github.com/sasha0552/ToriLinux/issues/new).

@@ -19,7 +20,7 @@ If you would like to see another AI-related project included in ToriLinux, pleas

 To use ToriLinux:
 1. Install [Ventoy](https://ventoy.net/en/doc_start.html) on a USB drive.
-2. Download the latest ISO from [workflows](https://github.com/sasha0552/ToriLinux/actions) and copy it to the USB drive.
+2. Download the latest ISO from [workflows](https://github.com/sasha0552/ToriLinux/actions?query=branch%3Amain) and copy it to the USB drive.
 3. Boot from the USB drive (select it as the boot device in BIOS/UEFI).
 4. Log in with the username `tori` and password `tori`. You can also use [SSH](https://en.wikipedia.org/wiki/Secure_Shell).

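A small sketch of step 4 done over SSH instead of the local console, assuming the live system obtained an address via DHCP (the IP below is a placeholder):

```bash
# Log in remotely; the password is "tori", as noted above.
ssh tori@192.168.1.123
```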
1 change: 1 addition & 0 deletions airootfs/home/tori/vllm
Submodule vllm added at 1db83e
10 changes: 0 additions & 10 deletions airootfs/root/customize_airootfs/files/automatic-config.json

This file was deleted.

@@ -0,0 +1,29 @@
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -191,7 +191,7 @@ enum slot_command
struct slot_params
{
bool stream = true;
- bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+ bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -712,7 +712,7 @@ struct llama_server_context
}

slot->params.stream = json_value(data, "stream", false);
- slot->params.cache_prompt = json_value(data, "cache_prompt", false);
+ slot->params.cache_prompt = json_value(data, "cache_prompt", true);
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
@@ -2439,7 +2439,7 @@ json oaicompat_completion_params_parse(
// Map OpenAI parameters to llama.cpp parameters
llama_params["model"] = json_value(body, "model", std::string("uknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
- llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
+ llama_params["cache_prompt"] = json_value(body, "cache_prompt", true);
llama_params["temperature"] = json_value(body, "temperature", 0.8);
llama_params["top_k"] = json_value(body, "top_k", 40);
llama_params["top_p"] = json_value(body, "top_p", 0.95);
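The patch above flips the example server's cache_prompt default to true, so repeated requests reuse the already-processed prompt. A hedged sketch of how a client could still opt out per request via the server's /completion endpoint; host and port are placeholders:

```bash
# Explicitly disable prompt caching for one request despite the new default.
curl -s http://127.0.0.1:8080/completion \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "Hello, world", "n_predict": 16, "cache_prompt": false}'
```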
@@ -0,0 +1,23 @@
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ ROOT_DIR = os.path.dirname(__file__)
MAIN_CUDA_VERSION = "12.1"

# Supported NVIDIA GPU architectures.
-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+NVIDIA_SUPPORTED_ARCHS = {"6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)

@@ -146,9 +146,9 @@ if _is_cuda() and not compute_capabilities:
device_count = torch.cuda.device_count()
for i in range(device_count):
major, minor = torch.cuda.get_device_capability(i)
- if major < 7:
+ if major < 6:
raise RuntimeError(
- "GPUs with compute capability below 7.0 are not supported.")
+ "GPUs with compute capability below 6.0 are not supported.")
compute_capabilities.add(f"{major}.{minor}")

if _is_cuda():
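This patch extends vllm's accepted NVIDIA architectures down to compute capability 6.0/6.1 (Pascal). A quick way to check what the local GPUs report, mirroring the torch.cuda calls used in setup.py (assumes a CUDA-enabled PyTorch install):

```bash
python3 - <<'EOF'
# Print each visible GPU's compute capability; anything below 6.0 is still
# unsupported even with the patch applied.
import torch
for i in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(i)
    print(f"GPU {i}: compute capability {major}.{minor}")
EOF
```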
@@ -16,7 +16,4 @@ pushd "automatic"

 # drop pstate in idle
 patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0000-automatic-drop-pstate-in-idle.patch"
-
-# copy config
-cp "$CUSTOMIZE_AIROOTFS/files/automatic-config.json" "config.json"
 popd
@@ -0,0 +1,8 @@
#!/bin/bash
set -eu

# llama.cpp patches
pushd "llama.cpp"
# enable prompt cache by default
patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-llamacpp-enable-prompt-cache.patch"
popd
8 changes: 8 additions & 0 deletions airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh
@@ -0,0 +1,8 @@
#!/bin/bash
set -eu

# vllm patches
pushd "vllm"
# build for pascal
patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-vllm-build-for-pascal.patch"
popd
@@ -3,15 +3,15 @@ set -eu

# automatic dependencies
pushd "automatic"
# disable package caching
export PIP_NO_CACHE_DIR=0

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate

# disable package caching
export PIP_NO_CACHE_DIR=0

# install dependencies
python3 launch.py --test
# install dependencies
python3 launch.py --test
deactivate
popd
@@ -3,17 +3,17 @@ set -eu

# SillyTavern-Extras dependencies
pushd "SillyTavern-Extras"
# disable package caching
export PIP_NO_CACHE_DIR=0

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate

# disable package caching
export PIP_NO_CACHE_DIR=0

# install dependencies
pip3 install -r requirements.txt
pip3 install -r requirements-coqui.txt
pip3 install -r requirements-rvc.txt
# install dependencies
pip3 install -r requirements.txt
pip3 install -r requirements-coqui.txt
pip3 install -r requirements-rvc.txt
deactivate
popd
@@ -3,15 +3,15 @@ set -eu

# text-generation-webui dependencies
pushd "text-generation-webui"
# disable package caching
export PIP_NO_CACHE_DIR=0

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate

# disable package caching
export PIP_NO_CACHE_DIR=0

# install dependencies
pip3 install -r requirements.txt
# install dependencies
pip3 install -r requirements.txt
deactivate
popd
46 changes: 46 additions & 0 deletions airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh
@@ -0,0 +1,46 @@
#!/bin/bash
set -eu

# vllm dependencies
pushd "vllm"
# disable package caching
export PIP_NO_CACHE_DIR=0

# limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1

# define supported architectures
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0"

# cuda home directory
export CUDA_HOME=/opt/cuda

# use gcc 12
export CC=gcc-12
export CXX=g++-12

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate
# install dependencies
pip3 install -r requirements.txt
pip3 install -r requirements-build.txt

# build native extension
python3 setup.py build_ext --inplace
deactivate

# remove venv
rm -fr venv

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate
# install dependencies
pip3 install -r requirements.txt
deactivate
popd
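The script builds vllm's native extension in place with a throwaway build venv, then recreates a slimmer runtime venv containing only requirements.txt. A hedged sketch for verifying the result on the booted live system; the path follows the submodule location, and vllm exposing __version__ is an assumption:

```bash
# Run from the vllm source tree so the in-place build is importable.
cd /home/tori/vllm
source venv/bin/activate
python3 -c "import vllm; print(vllm.__version__)"   # assumed attribute
deactivate
```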