Skip to content

Commit

Permalink
Add llama-cpp-python server
Browse files Browse the repository at this point in the history
Changed default runtime from 'llama.cpp' to 'llama-cpp-python'.
Added 'llama-cpp-python' as a runtime option for better
flexibility with the `--runtime` flag.

Signed-off-by: Eric Curtin <[email protected]>
  • Loading branch information
ericcurtin committed Dec 20, 2024
1 parent 307628e commit 07202c5
Show file tree
Hide file tree
Showing 13 changed files with 77 additions and 67 deletions.
6 changes: 3 additions & 3 deletions container-images/asahi/Containerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
FROM fedora:41

ARG LLAMA_CPP_SHA=1329c0a75e6a7defc5c380eaf80d8e0f66d7da78
ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "asahi" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
"$WHISPER_CPP_SHA"

7 changes: 3 additions & 4 deletions container-images/cuda/Containerfile
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
# Base image with CUDA for compilation
FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9 AS builder

ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "cuda" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/tmp/install" \
"-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined"
"$WHISPER_CPP_SHA"

# Final runtime image
FROM docker.io/nvidia/cuda:12.6.2-runtime-ubi9
Expand Down
6 changes: 3 additions & 3 deletions container-images/ramalama/Containerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
FROM registry.access.redhat.com/ubi9/ubi:9.4-1214.1729773476

ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "ramalama" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_KOMPUTE=1"
"$WHISPER_CPP_SHA"

ENV WHISPER_CPP_SHA=${WHISPER_CPP_SHA}
ENV LLAMA_CPP_SHA=${LLAMA_CPP_SHA}
2 changes: 1 addition & 1 deletion container-images/rocm/Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ COPY rocm/rocm.repo /etc/yum.repos.d/
COPY scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "rocm" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_HIPBLAS=1"
"$WHISPER_CPP_SHA"

39 changes: 26 additions & 13 deletions container-images/scripts/build_llama_and_whisper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ dnf_install() {
dnf install -y rocm-dev hipblas-devel rocblas-devel
elif [ "$containerfile" = "cuda" ]; then
dnf install -y "${rpm_list[@]}" gcc-toolset-12
source /opt/rh/gcc-toolset-12/enable
# shellcheck disable=SC1091
. /opt/rh/gcc-toolset-12/enable
fi

# For Vulkan image, we don't need to install anything extra but rebuild with
Expand All @@ -39,28 +40,42 @@ dnf_install() {

cmake_steps() {
local flag="$1"
cmake -B build "${common_flags[@]}" "$flag"
cmake -B build "${cpp_flags[@]}" "$flag"
cmake --build build --config Release -j"$(nproc)"
cmake --install build
}

# Pick the installation prefix for the image being built.
# Reads:  $containerfile  - image name supplied by the caller's scope.
# Writes: $install_prefix - "/tmp/install" for the cuda image, "/usr" for
#                           every other value.
set_install_prefix() {
  case "$containerfile" in
    cuda) install_prefix="/tmp/install" ;;
    *) install_prefix="/usr" ;;
  esac
}

main() {
set -e

local containerfile="$1"
local llama_cpp_sha="$2"
local whisper_cpp_sha="$3"
local install_prefix="$4"
local build_flag_1="$5"
local build_flag_2="$6"
local common_flags=("-DGGML_CCACHE=0" \
"-DCMAKE_INSTALL_PREFIX=$install_prefix" "$build_flag_1")
if [ -n "$build_flag_2" ]; then
common_flags+=("$build_flag_2")
local install_prefix
set_install_prefix
local common_flags=("-DGGML_NATIVE=OFF")
if [ "$containerfile" = "ramalama" ]; then
common_flags+=("-DGGML_KOMPUTE=1")
elif [ "$containerfile" = "rocm" ]; then
common_flags+=("-DGGML_HIPBLAS=1")
elif [ "$containerfile" = "cuda" ]; then
common_flags+=("-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined")
elif [ "$containerfile" = "vulkan" ] || [ "$containerfile" = "asahi" ]; then
common_flags+=("-DGGML_VULKAN=1")
fi

local cpp_flags=("${common_flags[@]}")
cpp_flags+=("-DGGML_CCACHE=0" \
"-DCMAKE_INSTALL_PREFIX=$install_prefix")
dnf_install

git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git reset --hard "$llama_cpp_sha"
Expand All @@ -75,9 +90,7 @@ main() {
mv build/bin/server "$install_prefix/bin/whisper-server"
cd ..

CMAKE_ARGS="${common_flags[*]}" FORCE_CMAKE=1 \
pip install --prefix="$install_prefix" 'llama-cpp-python[server]'

CMAKE_ARGS="${common_flags[*]}" pip install "llama-cpp-python[server]"
dnf clean all
rm -rf /var/cache/*dnf* /opt/rocm-*/lib/llvm \
/opt/rocm-*/lib/rocblas/library/*gfx9* llama.cpp whisper.cpp
Expand Down
2 changes: 1 addition & 1 deletion container-images/vulkan/Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ FROM quay.io/ramalama/ramalama:latest
COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "vulkan" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
"$WHISPER_CPP_SHA"

2 changes: 1 addition & 1 deletion docs/ramalama-serve.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ After=local-fs.target
[Container]
AddDevice=-/dev/dri
AddDevice=-/dev/kfd
Exec=llama-server --port 1234 -m $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
Exec=python3 -m llama_cpp.server --port 1234 --model $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
Image=quay.io/ramalama/ramalama:latest
Mount=type=bind,src=/home/dwalsh/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf,target=/mnt/models/model.file,ro,Z
ContainerName=MyGraniteServer
Expand Down
1 change: 1 addition & 0 deletions ramalama/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def configure_arguments(parser):
)
parser.add_argument("-v", "--version", dest="version", action="store_true", help="show RamaLama version")


def configure_subcommands(parser):
"""Add subcommand parsers to the main argument parser."""
subparsers = parser.add_subparsers(dest="subcommand")
Expand Down
5 changes: 0 additions & 5 deletions ramalama/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,6 @@ def exec_cmd(args, stderr=True, debug=False):
if debug:
perror("exec_cmd: ", *args)

if not stderr:
# Redirecting stderr to /dev/null
with open(os.devnull, "w") as devnull:
os.dup2(devnull.fileno(), sys.stderr.fileno())

try:
return os.execvp(args[0], args)
except Exception:
Expand Down
49 changes: 18 additions & 31 deletions ramalama/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,15 +233,15 @@ def run(self, args):
if not args.container:
raise KeyError("--nocontainer and --name options conflict. --name requires a container.")

prompt = "You are a helpful assistant"
prompt = ""
if args.ARGS:
prompt = " ".join(args.ARGS)

# Build a prompt by prepending the stdin text to the prompt passed as
# an argument to the ramalama CLI
if not sys.stdin.isatty():
input = sys.stdin.read()
prompt = input + "\n\n" + prompt
inp = sys.stdin.read()
prompt = inp + "\n\n" + prompt

if args.dryrun:
model_path = "/path/to/model"
Expand All @@ -254,36 +254,22 @@ def run(self, args):
if not args.container:
exec_model_path = model_path

exec_args = [
"llama-cli",
"-m",
exec_model_path,
"--in-prefix",
"",
"--in-suffix",
"",
"-c",
f"{args.context}",
"--temp",
f"{args.temp}",
]
exec_args = ["llama-run", "-c", f"{args.context}", "--temp", f"{args.temp}"]

if args.seed:
exec_args += ["--seed", args.seed]

if not args.debug:
exec_args += ["--no-display-prompt"]
exec_args += [
"-p",
prompt,
]

if not args.ARGS and sys.stdin.isatty():
exec_args.append("-cnv")
if args.debug:
exec_args += ["-v"]

if args.gpu:
exec_args.extend(self.gpu_args())

exec_args += [
exec_model_path,
prompt,
]

try:
if self.exec_model_in_container(model_path, exec_args, args):
return
Expand Down Expand Up @@ -315,22 +301,23 @@ def serve(self, args):
exec_model_path = model_path

exec_args = [
"llama-server",
"python3",
"-m",
"llama_cpp.server",
"--port",
args.port,
"-m",
"--model",
exec_model_path,
"-c",
f"{args.context}",
"--host",
args.host,
"--temp",
f"{args.temp}",
]
if args.seed:
exec_args += ["--seed", args.seed]

if args.runtime == "vllm":
if not (exec_model_path.endswith(".GGUF") or exec_model_path.endswith(".gguf")):
exec_model_path = os.path.dirname(exec_model_path)
exec_model_path = os.path.dirname(exec_model_path)
exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
else:
if args.gpu:
Expand Down
15 changes: 15 additions & 0 deletions scripts/replace-shas.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

set -euo pipefail

# Print the path of every file beneath container-images/ that contains
# the text "<NAME>_CPP_SHA=".
# $1 - variable-name prefix, e.g. LLAMA or WHISPER.
# Exit status is grep's: non-zero when nothing matches.
find_files() {
  local needle="${1}_CPP_SHA="
  grep -r -l "$needle" container-images/
}

# Rewrite `ARG <NAME>_CPP_SHA=...` lines in place in the files named on stdin.
# $1 - variable-name prefix, e.g. LLAMA or WHISPER.
# $2 - new value to pin; must be non-empty.
# Stdin: newline-separated file paths; an empty list is a successful no-op.
# Returns non-zero when $2 is empty or when sed fails on any file.
sed_files() {
  # An empty value would silently blank out the pinned revision.
  if [ -z "${2:-}" ]; then
    printf 'sed_files: no value given for %s_CPP_SHA\n' "${1:-}" >&2
    return 1
  fi

  # Escape the characters that are special in a sed replacement (\, / and &)
  # so that refs such as "release/x" are written out literally.
  local escaped
  escaped=$(printf '%s' "$2" | sed 's|[\\/&]|\\&|g')

  # -r: do not run sed at all when no paths arrive on stdin.
  # -d '\n': one path per line, so paths containing spaces stay intact.
  xargs -r -d '\n' \
    sed -i -e "s/ARG ${1}_CPP_SHA=.*/ARG ${1}_CPP_SHA=${escaped}/" --
}

# Both revisions are mandatory; print a usage message rather than letting
# `set -u` abort with an "unbound variable" error on a missing argument.
if [ "$#" -lt 2 ]; then
  printf 'Usage: %s <llama_cpp_sha> <whisper_cpp_sha>\n' "${0##*/}" >&2
  exit 2
fi

# $1 becomes the new LLAMA_CPP_SHA, $2 the new WHISPER_CPP_SHA, in every
# file under container-images/ that mentions the respective variable.
find_files "LLAMA" | sed_files "LLAMA" "$1"
find_files "WHISPER" | sed_files "WHISPER" "$2"

2 changes: 1 addition & 1 deletion test/system/030-run.bats
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ load helpers
is "$output" ".*${image} /bin/sh -c" "verify image name"
else
run_ramalama --dryrun run -c 4096 ${model}
is "$output" 'llama-cli -m /path/to/model --in-prefix --in-suffix -c 4096 --temp 0.8 --no-display-prompt -p.*' "dryrun correct"
is "$output" 'llama-run -c 4096 --temp 0.8 /path/to/model' "dryrun correct"
is "$output" ".*-c 4096" "verify model name"

run_ramalama 1 run --ctx-size=4096 --name foobar tiny
Expand Down
8 changes: 4 additions & 4 deletions test/system/040-serve.bats
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat tinyllama.container
is "$output" ".*PublishPort=1234" "PublishPort should match"
is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
is "$output" ".*Mount=type=bind,.*tinyllama" "Mount line should be correct"

rm tinyllama.container
Expand Down Expand Up @@ -183,7 +183,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
run cat $name.container
is "$output" ".*PublishPort=1234" "PublishPort should match"
is "$output" ".*ContainerName=${name}" "Quadlet should have ContainerName field"
is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
is "$output" ".*Mount=type=image,source=${ociimage},destination=/mnt/models,subpath=/models,readwrite=false" "Volume line should be correct"

if is_container; then
Expand Down Expand Up @@ -235,7 +235,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat $name.yaml
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*command: \[\"python3\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"

run_ramalama serve --name=${name} --port 1234 --generate=quadlet/kube ${model}
Expand All @@ -244,7 +244,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat $name.yaml
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*command: \[\"python3\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"

run cat $name.kube
Expand Down

0 comments on commit 07202c5

Please sign in to comment.