Merge branch 'ggerganov:master' into master
bmtwl authored Feb 16, 2024
2 parents 26ea983 + d2819d5 commit 238445e
Showing 11 changed files with 139 additions and 58 deletions.
4 changes: 4 additions & 0 deletions ci/run.sh
@@ -580,6 +580,10 @@ function gg_run_embd_bge_small {
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json

gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json

path_models="../models-mnt/bge-small"

59 changes: 38 additions & 21 deletions common/common.cpp
@@ -341,7 +341,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
const auto sampler_names = string_split(argv[i], ';');
sparams.samplers_sequence = sampler_types_from_names(sampler_names);
sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
} else if (arg == "--sampling-seq") {
if (++i >= argc) {
invalid_param = true;
@@ -964,7 +964,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
printf(" (default: %s)\n", sampler_type_names.c_str());
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
@@ -1133,34 +1134,50 @@ std::vector<std::string> string_split(std::string input, char separator) {
return parts;
}

std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K},
{"top_p", llama_sampler_type::TOP_P},
{"typical_p", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"temperature", llama_sampler_type::TEMPERATURE}
};

// since samplers names are written multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
{"top_k", llama_sampler_type::TOP_K},
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
{"top-k", llama_sampler_type::TOP_K},
{"top_p", llama_sampler_type::TOP_P},
{"top-p", llama_sampler_type::TOP_P},
{"nucleus", llama_sampler_type::TOP_P},
{"typical_p", llama_sampler_type::TYPICAL_P},
{"typical-p", llama_sampler_type::TYPICAL_P},
{"typical", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"min-p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", llama_sampler_type::TFS_Z},
{"temp", llama_sampler_type::TEMP},
{"temperature", llama_sampler_type::TEMP}
{"temp", llama_sampler_type::TEMPERATURE}
};

std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names.size());
for (const auto& name : names) {
const auto sampler_item = sampler_name_map.find(name);
if (sampler_item != sampler_name_map.end()) {
for (const auto & name : names)
{
auto sampler_item = sampler_canonical_name_map.find(name);
if (sampler_item != sampler_canonical_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
else
{
if (allow_alt_names)
{
sampler_item = sampler_alt_name_map.find(name);
if (sampler_item != sampler_alt_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
}
}
}
return sampler_types;
}
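
For illustration, a minimal sketch (not part of this commit) of how the two-map lookup above behaves for a --samplers style argument, assuming the declarations from common/common.h:

#include <cstdio>
#include <string>
#include <vector>
#include "common.h"

int main() {
    // "--samplers top-k;typical;temperature" is first split on ';' and then resolved.
    // The alternate spellings "top-k" and "typical" resolve only because allow_alt_names
    // is true, as it is when parsing the --samplers argument; the server passes false.
    const std::vector<std::string> names = string_split("top-k;typical;temperature", ';');
    const auto types = sampler_types_from_names(names, /*allow_alt_names=*/true);

    for (const auto type : types) {
        printf("%s\n", sampler_type_to_name_string(type).c_str()); // top_k, typical_p, temperature
    }
    return 0;
}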
@@ -1172,7 +1189,7 @@ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & nam
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMP}
{'t', llama_sampler_type::TEMPERATURE}
};

std::vector<llama_sampler_type> sampler_types;
@@ -1188,12 +1205,12 @@ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & nam

std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
switch (sampler_type) {
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMP: return "temp";
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMPERATURE: return "temperature";
default : return "";
}
}
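
Similarly, an illustrative sketch of the single-character shorthand consumed by --sampling-seq, using the characters defined by the enum in common/sampling.h (again assuming the declarations from common/common.h):

#include <cstdio>
#include "common.h"

int main() {
    // "kfypmt" spells out the default order: top_k, tfs_z, typical_p, top_p, min_p, temperature.
    for (const auto type : sampler_types_from_chars("kfypmt")) {
        printf("%s\n", sampler_type_to_name_string(type).c_str());
    }
    return 0;
}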
2 changes: 1 addition & 1 deletion common/common.h
@@ -165,7 +165,7 @@ void process_escapes(std::string& input);
// String utils
//

std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
2 changes: 1 addition & 1 deletion common/sampling.cpp
@@ -139,7 +139,7 @@ static void sampler_queue(
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case llama_sampler_type::TEMP:
case llama_sampler_type::TEMPERATURE:
if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
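For reference, with temp = 0.8 and dynatemp_range = 0.3 the clamped window above works out to dynatemp_min = 0.5 and dynatemp_max = 1.1; with temp = 0.2 and dynatemp_range = 0.5 the lower bound clamps to 0.0, giving a window of [0.0, 0.7].
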
14 changes: 7 additions & 7 deletions common/sampling.h
@@ -10,12 +10,12 @@

// sampler types
enum class llama_sampler_type : char {
TOP_K = 'k',
TOP_P = 'p',
MIN_P = 'm',
TFS_Z = 'f',
TYPICAL_P = 'y',
TEMP = 't'
TOP_K = 'k',
TOP_P = 'p',
MIN_P = 'm',
TFS_Z = 'f',
TYPICAL_P = 'y',
TEMPERATURE = 't'
};

// sampling parameters
@@ -45,7 +45,7 @@ typedef struct llama_sampling_params {
llama_sampler_type::TYPICAL_P,
llama_sampler_type::TOP_P,
llama_sampler_type::MIN_P,
llama_sampler_type::TEMP
llama_sampler_type::TEMPERATURE
};

std::string grammar; // optional BNF-like grammar to constrain sampling
4 changes: 2 additions & 2 deletions examples/llava/README.md
@@ -63,8 +63,8 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
1) Backup your pth/safetensor model files as llava-surgery modifies them
2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models:
- you will find a llava.projector and a llava.clip file in your model directory
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config.json)
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip_model_is_vision`
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json.
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision`
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
5) Everything else as usual: convert.py the hf model, quantize as needed
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
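
For orientation, after steps 3 and 4 the vit directory passed via `-m` would contain at least the following (illustrative layout derived from the steps above):

    vit/
      pytorch_model.bin   (the renamed llava.clip file)
      config.json         (the downloaded config_vit.json, renamed)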
1 change: 0 additions & 1 deletion examples/llava/llava.cpp
@@ -315,7 +315,6 @@ static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_thre
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
if (!image_embd) {
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
free(image_embd);
return false;
}

2 changes: 2 additions & 0 deletions examples/server/README.md
@@ -204,6 +204,8 @@ node index.js
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)
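
For illustration, a minimal client-side sketch of a request body that sets this field (assumes the single-header nlohmann::json bundled with the server example; prompt text and field values are arbitrary):

#include <cstdio>
#include "json.hpp"   // assumed include path for the bundled nlohmann::json header
using json = nlohmann::json;

int main() {
    // Overrides the sampler order for one completion. Alternate spellings such as "top-k"
    // are dropped server-side, because this field is resolved with allow_alt_names == false
    // (see the server.cpp change further down).
    json body = {
        {"prompt",   "Building a website can be done in 10 simple steps:"},
        {"samplers", {"top_k", "min_p", "temperature"}}   // applied in exactly this order
    };
    printf("%s\n", body.dump(2).c_str());
    return 0;
}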
### Result JSON
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
70 changes: 46 additions & 24 deletions examples/server/server.cpp
@@ -436,10 +436,6 @@ struct llama_server_context
default_generation_settings_for_props["seed"] = -1;

batch = llama_batch_init(n_ctx, 0, params.n_parallel);

// empty system prompt
system_prompt = "";
system_tokens.clear();
}

std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -676,6 +672,24 @@ struct llama_server_context
}
}

const auto &samplers_sequence = data.find("samplers");
if (samplers_sequence != data.end() && samplers_sequence->is_array())
{
std::vector<std::string> sampler_names;
for (const auto &sampler_name : *samplers_sequence)
{
if (sampler_name.is_string())
{
sampler_names.emplace_back(sampler_name);
}
}
slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
}
else
{
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
}

if (multimodal)
{
const auto &images_data = data.find("image_data");
@@ -765,27 +779,30 @@ struct llama_server_context
}

void update_system_prompt() {
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
kv_cache_clear();
system_tokens.clear();

llama_batch_clear(batch);
if (!system_prompt.empty()) {
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);

kv_cache_clear();
llama_batch_clear(batch);

for (int i = 0; i < (int) system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}
for (int i = 0; i < (int)system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}

if (llama_decode(ctx, batch) != 0)
{
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}
if (llama_decode(ctx, batch) != 0)
{
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}

// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i < params.n_parallel; ++i)
{
llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i < params.n_parallel; ++i)
{
llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
}
}

LOG_TEE("system prompt updated\n");
@@ -807,10 +824,8 @@ struct llama_server_context
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");

if (slots.size() > 0)
{
notify_system_prompt_changed();
}

notify_system_prompt_changed();
}

static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -1029,6 +1044,12 @@ struct llama_server_context
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
std::vector<std::string> samplers_sequence;
for (const auto &sampler_type : slot.sparams.samplers_sequence)
{
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
}

return json {
{"n_ctx", slot.n_ctx},
{"model", params.model_alias},
@@ -1059,6 +1080,7 @@ struct llama_server_context
{"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"grammar", slot.sparams.grammar},
{"samplers", samplers_sequence}
};
}

2 changes: 1 addition & 1 deletion llama.cpp
@@ -10893,7 +10893,7 @@ static int llama_apply_lora_from_file_internal(
{
LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
__func__, ftype);
return false;
return 1;
}
}

37 changes: 37 additions & 0 deletions scripts/compare-commits.sh
@@ -0,0 +1,37 @@
#!/bin/bash

if [ $# -lt 2 ]; then
echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]"
exit 1
fi

set -e
set -x

bench_args="${@:3}"

rm -f llama-bench.sqlite

backend="cpu"

if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi

make_opts=""

if [[ "$backend" == "cuda" ]]; then
make_opts="LLAMA_CUBLAS=1"
fi

git checkout $1
make clean && make -j32 $make_opts llama-bench
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite

git checkout $2
make clean && make -j32 $make_opts llama-bench
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite

./scripts/compare-llama-bench.py -b $1 -c $2
