Skip to content

Commit

Permalink
Merge branch 'master' into dev-refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
chraac committed Aug 5, 2024
2 parents 47f6e02 + bc0f887 commit 5ea980d
Show file tree
Hide file tree
Showing 69 changed files with 1,803 additions and 553 deletions.
4 changes: 2 additions & 2 deletions .devops/llama-server.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev curl
apt-get install -y build-essential git libcurl4-openssl-dev

WORKDIR /app

Expand All @@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1
apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/llama-server /llama-server

Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
Expand Down
29 changes: 14 additions & 15 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1605,42 +1605,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server

# Define the object file target
examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these target from building all the time.
main: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
main: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."

server: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."

quantize: examples/deprecation-warning/deprecation-warning.cpp
quantize: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif

perplexity: examples/deprecation-warning/deprecation-warning.cpp
perplexity: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif

embedding: examples/deprecation-warning/deprecation-warning.cpp
embedding: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
- [x] [Smaug](https://huggingface.co/models?search=Smaug)
- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

Expand Down Expand Up @@ -145,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
Expand Down
2 changes: 1 addition & 1 deletion common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
Expand Down
2 changes: 1 addition & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def prepare_tensors(self):
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data = gguf.quantize_bf16(data)
assert data.dtype == np.int16
assert data.dtype == np.uint16
data_qtype = gguf.GGMLQuantizationType.BF16

elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
Expand Down
6 changes: 5 additions & 1 deletion docs/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
cmake --build build --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
Expand Down
1 change: 0 additions & 1 deletion examples/baby-llama/baby-llama.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "ggml.h"
#include "train.h"

#include <vector>
#include <cassert>
#include <cstdlib>
#include <cstring>
Expand Down
2 changes: 1 addition & 1 deletion examples/batched-bench/batched-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

llama_context * ctx = llama_new_context_with_model(model, ctx_params);

Expand Down
2 changes: 1 addition & 1 deletion examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ server:
--host HOST ip address to listen (default: 127.0.0.1)
--port PORT port to listen (default: 8080)
--path PATH path to serve static files from (default: )
--embedding(s) enable embedding endpoint (default: disabled)
--embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
--api-key KEY API key to use for authentication (default: none)
--api-key-file FNAME path to file containing API keys (default: none)
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
Expand Down
2 changes: 1 addition & 1 deletion examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -900,7 +900,7 @@ struct server_context {

slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
Expand Down
18 changes: 0 additions & 18 deletions examples/server/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,24 +355,6 @@ static json oaicompat_completion_params_parse(

llama_params["__oaicompat"] = true;

// Map OpenAI parameters to llama.cpp parameters
//
// For parameters that are defined by the OpenAI documentation (e.g.
// temperature), we explicitly specify OpenAI's intended default; we
// need to do that because sometimes OpenAI disagrees with llama.cpp
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["temperature"] = json_value(body, "temperature", 1.0);
llama_params["top_p"] = json_value(body, "top_p", 1.0);

// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

Expand Down
20 changes: 10 additions & 10 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ extern "C" {
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

struct ggml_object;
Expand Down Expand Up @@ -1455,7 +1456,6 @@ extern "C" {
// if mode & 2 == 1, GPT-NeoX style
//
// b is an int32 vector with size a->ne[2], it contains the positions
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
Expand All @@ -1472,6 +1472,7 @@ extern "C" {
int mode);

// custom RoPE
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
Expand Down
Loading

0 comments on commit 5ea980d

Please sign in to comment.