Add vllm & general improvements (#1)
Patches:
* Enable prompt cache in llama.cpp server

Projects:
* Add vllm

CI:
* Run CI on any branch
* Run CI on PRs
* Rename workflow

Misc:
* Remove Tesla P40-specific automatic config
* Update README.md
Showing 15 changed files with 146 additions and 39 deletions.
Submodule llama.cpp updated: 7 files

+1 −1    .github/workflows/docker.yml
+50 −6   common/sampling.cpp
+3 −0    common/sampling.h
+2 −0    examples/server/README.md
+44 −0   examples/server/server.cpp
+12 −7   ggml-cuda.cu
+11 −5   llama.cpp
airootfs/root/customize_airootfs/files/automatic-config.json (10 changes: 0 additions & 10 deletions)
This file was deleted.
airootfs/root/customize_airootfs/patches/0100-llamacpp-enable-prompt-cache.patch (29 changes: 29 additions & 0 deletions)
@@ -0,0 +1,29 @@
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -191,7 +191,7 @@ enum slot_command
 struct slot_params
 {
     bool stream = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

     uint32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -712,7 +712,7 @@ struct llama_server_context
     }

     slot->params.stream = json_value(data, "stream", false);
-    slot->params.cache_prompt = json_value(data, "cache_prompt", false);
+    slot->params.cache_prompt = json_value(data, "cache_prompt", true);
     slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
     slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
     slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
@@ -2439,7 +2439,7 @@ json oaicompat_completion_params_parse(
     // Map OpenAI parameters to llama.cpp parameters
     llama_params["model"] = json_value(body, "model", std::string("uknown"));
     llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
-    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
+    llama_params["cache_prompt"] = json_value(body, "cache_prompt", true);
     llama_params["temperature"] = json_value(body, "temperature", 0.8);
     llama_params["top_k"] = json_value(body, "top_k", 40);
     llama_params["top_p"] = json_value(body, "top_p", 0.95);
airootfs/root/customize_airootfs/patches/0100-vllm-build-for-pascal.patch (23 changes: 23 additions & 0 deletions)
@@ -0,0 +1,23 @@
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ ROOT_DIR = os.path.dirname(__file__)
 MAIN_CUDA_VERSION = "12.1"

 # Supported NVIDIA GPU architectures.
-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+NVIDIA_SUPPORTED_ARCHS = {"6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
 ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
 # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)

@@ -146,9 +146,9 @@ if _is_cuda() and not compute_capabilities:
     device_count = torch.cuda.device_count()
     for i in range(device_count):
         major, minor = torch.cuda.get_device_capability(i)
-        if major < 7:
+        if major < 6:
             raise RuntimeError(
-                "GPUs with compute capability below 7.0 are not supported.")
+                "GPUs with compute capability below 6.0 are not supported.")
         compute_capabilities.add(f"{major}.{minor}")

 if _is_cuda():
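The patch widens the accepted set to compute capability 6.x, so Pascal cards (for example a Tesla P40, which is 6.1) pass the check in setup.py. The sketch below shows a quick way to see what the patched check would accept on a given machine, using the same torch.cuda calls the build script uses; it is an illustrative helper, not part of the patch.

```python
# Sketch: report each visible GPU's compute capability and whether the
# patched vllm setup.py threshold (>= 6.0 instead of >= 7.0) would accept it.
import torch

def check_gpus(min_major: int = 6) -> None:
    if not torch.cuda.is_available():
        print("No CUDA device visible")
        return
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        status = "supported" if major >= min_major else "unsupported"
        print(f"GPU {i}: compute capability {major}.{minor} -> {status}")

if __name__ == "__main__":
    check_gpus()
```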
airootfs/root/customize_airootfs/scripts/0100-llamacpp-patches.sh (8 changes: 8 additions & 0 deletions)
@@ -0,0 +1,8 @@
#!/bin/bash
set -eu

# llama.cpp patches
pushd "llama.cpp"
# enable prompt cache by default
patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-llamacpp-enable-prompt-cache.patch"
popd
airootfs/root/customize_airootfs/scripts/0100-vllm-patches.sh (8 changes: 8 additions & 0 deletions)
@@ -0,0 +1,8 @@
#!/bin/bash
set -eu

# vllm patches
pushd "vllm"
# build for pascal
patch -p1 < "$CUSTOMIZE_AIROOTFS/patches/0100-vllm-build-for-pascal.patch"
popd
airootfs/root/customize_airootfs/scripts/1000-vllm-dependencies.sh (46 changes: 46 additions & 0 deletions)
@@ -0,0 +1,46 @@
#!/bin/bash
set -eu

# vllm dependencies
pushd "vllm"
# disable package caching
export PIP_NO_CACHE_DIR=0

# limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1

# define supported architectures
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0"

# cuda home directory
export CUDA_HOME=/opt/cuda

# use gcc 12
export CC=gcc-12
export CXX=g++-12

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate
# install dependencies
pip3 install -r requirements.txt
pip3 install -r requirements-build.txt

# build native extension
python3 setup.py build_ext --inplace
deactivate

# remove venv
rm -fr venv

# create venv
python3 -m venv venv

# activate venv
source venv/bin/activate
# install dependencies
pip3 install -r requirements.txt
deactivate
popd
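The script builds the native extension in place with one venv, discards that venv, and then recreates a leaner venv with only the runtime requirements. A small sanity check that could be run from inside the final venv in the vllm checkout is sketched below; the module and attributes referenced are assumptions about the vllm and torch packages, not something this commit adds.

```python
# Sketch: confirm torch sees a CUDA device and the vllm package imports
# from the in-place build (run after "source venv/bin/activate" in ./vllm).
import torch
import vllm

print("torch", torch.__version__, "CUDA available:", torch.cuda.is_available())
print("vllm", vllm.__version__, "imported")
```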