diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 7a2a193fd38722..1616039ad8bbed 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1 +1,42 @@ #include "llama-arch.h" + +#include "llama-impl.h" + +LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {} + +std::string LLM_KV::operator()(llm_kv kv) const { + return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch)); +} + +std::string LLM_TN_IMPL::str() const { + if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { + return "__missing__"; + } + + std::string name = ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid); + + if (suffix != nullptr) { + name += "."; + name += suffix; + } + + return name; +} + +const char * llm_arch_name(llm_arch arch) { + auto it = LLM_ARCH_NAMES.find(arch); + if (it == LLM_ARCH_NAMES.end()) { + return "unknown"; + } + return it->second; +} + +llm_arch llm_arch_from_string(const std::string & name) { + for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + + return LLM_ARCH_UNKNOWN; +} diff --git a/src/llama-arch.h b/src/llama-arch.h index e2bdb295dfb0b6..a68cbd262e4278 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -1,7 +1,5 @@ #pragma once -#include "llama-impl.h" - #include // @@ -375,13 +373,11 @@ static const std::map LLM_KV_NAMES = { }; struct LLM_KV { - LLM_KV(llm_arch arch) : arch(arch) {} + LLM_KV(llm_arch arch); llm_arch arch; - std::string operator()(llm_kv kv) const { - return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch)); - } + std::string operator()(llm_kv kv) const; }; enum llm_tensor { @@ -1589,16 +1585,6 @@ static const std::map LLM_CHAT_TEMPLATES = { { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, }; -static llm_arch llm_arch_from_string(const std::string & name) { - for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT - if (kv.second == name) { - return kv.first; - } - } - - return LLM_ARCH_UNKNOWN; -} - // helper to handle gguf constants // usage: // @@ -1615,20 +1601,7 @@ struct LLM_TN_IMPL { const int bid; const int xid; - std::string str() const { - if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { - return "__missing__"; - } - - std::string name = ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid); - - if (suffix != nullptr) { - name += "."; - name += suffix; - } - - return name; - } + std::string str() const; operator std::string() const { return str(); @@ -1657,58 +1630,6 @@ struct LLM_TN { } }; -// -// load LLaMA models -// - -static const char * llama_model_arch_name(llm_arch arch) { - auto it = LLM_ARCH_NAMES.find(arch); - if (it == LLM_ARCH_NAMES.end()) { - return "unknown"; - } - return it->second; -} - -static std::string llama_model_ftype_name(llama_ftype ftype) { - if (ftype & LLAMA_FTYPE_GUESSED) { - return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; - } - - switch (ftype) { - case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "F16"; - case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; - case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; - case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; - case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; - case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - - default: return "unknown, may not work"; - } -} +const char * llm_arch_name(llm_arch arch); +llm_arch llm_arch_from_string(const std::string & name); diff --git a/src/llama-impl.h b/src/llama-impl.h index 7a622f213a7902..273897c08fae05 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -24,22 +24,8 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3) void llama_log_internal (ggml_log_level level, const char * format, ...); void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); -// TODO: move to source LLAMA_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} +std::string format(const char * fmt, ...); #define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 1dcfdcd1896e49..2b9197bb8a84a5 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -1 +1,3 @@ #include "llama-mmap.h" + + diff --git a/src/llama-mmap.h b/src/llama-mmap.h index f091558e3b05be..a1b50b3ffa3288 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -4,8 +4,6 @@ #include "ggml.h" -#include - #ifdef __has_include #if __has_include() #include diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2364e7c9561bcf..1c563b4c87e51f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1 +1,44 @@ #include "llama-model.h" + +std::string llama_model_ftype_name(llama_ftype ftype) { + if (ftype & LLAMA_FTYPE_GUESSED) { + return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; + } + + switch (ftype) { + case LLAMA_FTYPE_ALL_F32: return "all F32"; + case LLAMA_FTYPE_MOSTLY_F16: return "F16"; + case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; + case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; + case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; + case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; + case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; + case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + + default: return "unknown, may not work"; + } +} diff --git a/src/llama-model.h b/src/llama-model.h index f3bd79aa9715f9..bf030e90b64e4c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -648,3 +648,5 @@ static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & b throw std::runtime_error(format("no suitable buffer type found")); } + +std::string llama_model_ftype_name(llama_ftype ftype); diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 2943c34804f89d..834ad6ab8527a3 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -1,6 +1,6 @@ #pragma once -#include "llama-impl.h" +#include "llama.h" #include #include diff --git a/src/llama.cpp b/src/llama.cpp index 1ab22e6a4ab939..e7ab6f94bda5db 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -59,6 +59,21 @@ // helpers // +std::string format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + // trim whitespace from the beginning and end of a string static std::string trim(const std::string & str) { size_t start = 0; @@ -16432,9 +16447,9 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", - llama_model_arch_name(model->arch), - llama_model_type_name(model->type), - llama_model_ftype_name(model->ftype).c_str()); + llm_arch_name(model->arch), // TODO: llama_model_arch_name(model) + llama_model_type_name(model->type), // TODO: llama_model_type_name(model) + llama_model_ftype_name(model->ftype).c_str()); // TODO: llama_model_ftype_name(model) } uint64_t llama_model_size(const struct llama_model * model) {