From dcbfda1f694682457b86719ce5f5bc153248afb2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 22 Dec 2024 22:28:20 +0200 Subject: [PATCH] llama : adapter ggml-ci --- common/common.h | 10 ++ include/llama.h | 5 +- src/llama-adapter.cpp | 313 ++++++++++++++++++++++++++++++++++++++++ src/llama-adapter.h | 327 +++--------------------------------------- src/llama-model.cpp | 13 ++ src/llama-model.h | 15 +- src/llama.cpp | 29 ++-- 7 files changed, 383 insertions(+), 329 deletions(-) diff --git a/common/common.h b/common/common.h index 1d2bd932c211dc..fb5c56d88489c3 100644 --- a/common/common.h +++ b/common/common.h @@ -481,7 +481,17 @@ std::string fs_get_cache_file(const std::string & filename); struct common_init_result { struct llama_model * model = nullptr; struct llama_context * context = nullptr; + std::vector lora_adapters; + + ~common_init_result() { + llama_free(context); + llama_free_model(model); + + for (auto & lora_adapter : lora_adapters) { + llama_lora_adapter_free(lora_adapter.adapter); + } + } }; struct common_init_result common_init_from_params(common_params & params); diff --git a/include/llama.h b/include/llama.h index bb6970cb733d55..05c0130e46c215 100644 --- a/include/llama.h +++ b/include/llama.h @@ -417,6 +417,7 @@ extern "C" { const char * path_model, struct llama_model_params params); + // TODO: rename to llama_model_free LLAMA_API void llama_free_model(struct llama_model * model); // TODO: rename to llama_init_from_model @@ -507,7 +508,6 @@ extern "C" { // // Load a LoRA adapter from file - // The loaded adapter will be associated to the given model, and will be free when the model is deleted // TODO: rename to llama_adapter_lora_init LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_model * model, @@ -530,8 +530,7 @@ extern "C" { // Remove all LoRA adapters from given context // TODO: rename to llama_clear_adapter_lora - LLAMA_API void llama_lora_adapter_clear( - struct llama_context * ctx); + LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx); // Manually free a LoRA adapter // Note: loaded adapters will be free when the associated model is deleted diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 587210512b3cc3..4394ade1ae9964 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -1,5 +1,318 @@ #include "llama-adapter.h" +#include "llama-model.h" + +#include +#include +#include + +// vec + +struct ggml_tensor * llama_control_vector::tensor_for(int il) const { + if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { + return nullptr; + } + + return tensors[il]; +} + +struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const { + ggml_tensor * layer_dir = tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx, cur, layer_dir); + } + + return cur; +} + +static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { + const auto & hparams = model.hparams; + + GGML_ASSERT(cvec.tensors.empty()); + GGML_ASSERT(cvec.ctxs.empty()); + GGML_ASSERT(cvec.bufs.empty()); + + // create a context for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + struct ggml_init_params params = { + /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map[buft] = ctx; + cvec.ctxs.emplace_back(ctx); + + return ctx; + } + + return it->second; + }; + + // make tensors + cvec.tensors.reserve(hparams.n_layer); + cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 + for (size_t il = 1; il < hparams.n_layer; il++) { + ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il); + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); + return false; + } + ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); + cvec.tensors.push_back(tensor); + } + + // allocate tensors / buffers and zero + cvec.bufs.reserve(ctx_map.size()); + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + cvec.bufs.emplace_back(buf); + } + + return true; +} + +int32_t llama_control_vector_apply( + struct llama_control_vector & cvec, + const llama_model & model, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + const auto & hparams = model.hparams; + + if (data == nullptr) { + // disable the current control vector (but leave allocated for later) + cvec.layer_start = -1; + cvec.layer_end = -1; + return 0; + } + + if (n_embd != (int) hparams.n_embd) { + LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); + return 1; + } + + if (cvec.tensors.empty()) { + if (!llama_control_vector_init(cvec, model)) { + return 1; + } + } + + cvec.layer_start = il_start; + cvec.layer_end = il_end; + + for (size_t il = 1; il < hparams.n_layer; il++) { + assert(cvec.tensors[il] != nullptr); + + const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present + if (off + n_embd <= len) { + ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il])); + } + } + + return 0; +} + +// lora + +llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) { + const std::string name(w->name); + + const auto pos = ab_map.find(name); + if (pos != ab_map.end()) { + return &pos->second; + } + + return nullptr; +} + void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { delete adapter; } + +void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { + LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); + + ggml_context * ctx_init; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &ctx_init, + }; + + gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; + if (!ctx_gguf) { + throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); + } + + ggml_context_ptr ctx { ctx_init }; + + // check metadata + { + auto get_kv_str = [&](const std::string & key) -> std::string { + int id = gguf_find_key(ctx_gguf.get(), key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); + }; + auto get_kv_f32 = [&](const std::string & key) -> float { + int id = gguf_find_key(ctx_gguf.get(), key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); + }; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + + auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); + if (general_type != "adapter") { + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); + auto general_arch = llm_arch_from_string(general_arch_str); + if (general_arch != model.arch) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + + auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); + if (adapter_type != "lora") { + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); + } + + int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); + + // contexts for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + // add a new context + struct ggml_init_params params = { + /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * buft_ctx = ggml_init(params); + if (!buft_ctx) { + return nullptr; + } + ctx_map[buft] = buft_ctx; + adapter.ctxs.emplace_back(buft_ctx); + return buft_ctx; + }; + return it->second; + }; + + // bundle lora_a and lora_b into pairs + std::map ab_map; + auto str_endswith = [](const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; + }; + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) { + std::string name(cur->name); + if (str_endswith(name, ".lora_a")) { + replace_all(name, ".lora_a", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = llama_lora_weight(cur, nullptr); + } else { + ab_map[name].a = cur; + } + } else if (str_endswith(name, ".lora_b")) { + replace_all(name, ".lora_b", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = llama_lora_weight(nullptr, cur); + } else { + ab_map[name].b = cur; + } + } else { + throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); + } + } + + // add tensors + for (auto & it : ab_map) { + const std::string & name = it.first; + llama_lora_weight & w = it.second; + + if (!w.a || !w.b) { + throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); + } + + // device buft and device ctx + auto * model_tensor = llama_model_get_tensor(model, name.c_str()); + if (!model_tensor) { + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); + } + + struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); + // validate tensor shape + if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + throw std::runtime_error("tensor '" + name + "' has incorrect shape"); + } + if (w.a->ne[1] != w.b->ne[0]) { + throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); + } + + // save tensor to adapter + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b); + } + + // allocate tensors / buffers and zero + { + adapter.ctxs.reserve(ctx_map.size()); + adapter.bufs.reserve(ctx_map.size()); + for (auto & it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx_dev = it.second; + ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) }; + if (!buf) { + throw std::runtime_error("failed to allocate buffer for lora adapter\n"); + } + LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0); + adapter.bufs.emplace_back(std::move(buf)); + } + } + + // set tensor data + { + llama_file gguf_file(path_lora, "rb"); + std::vector read_buf; + auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { + size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); + size_t size = ggml_nbytes(orig); + read_buf.resize(size); + gguf_file.seek(offs, SEEK_SET); + gguf_file.read_raw(read_buf.data(), size); + ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + }; + for (auto & it : adapter.ab_map) { + auto orig = ab_map[it.first]; + auto dev = it.second; + set_tensor(orig.a, dev.a); + set_tensor(orig.b, dev.b); + } + } + + LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); +} diff --git a/src/llama-adapter.h b/src/llama-adapter.h index b8351ef7e93f47..7b8ce47a824631 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -1,13 +1,11 @@ #pragma once #include "llama-impl.h" -#include "ggml-cpp.h" +#include "llama-hparams.h" -#include "llama-model.h" // TODO: need only hparams +#include "ggml-cpp.h" -#include -#include -#include +#include #include // @@ -16,119 +14,27 @@ // TODO: rename to llama_adapter_vec struct llama_control_vector { - std::vector tensors; // per layer std::vector ctxs; std::vector bufs; + std::vector tensors; // per layer + int32_t layer_start = -1; int32_t layer_end = -1; - struct ggml_tensor * tensor_for(int il) const { - if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { - return nullptr; - } - return tensors[il]; - } + struct ggml_tensor * tensor_for(int il) const; - struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const { - ggml_tensor * layer_dir = tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx, cur, layer_dir); - } - return cur; - } + struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const; }; -static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { - GGML_ASSERT(cvec.tensors.empty()); - GGML_ASSERT(cvec.ctxs.empty()); - GGML_ASSERT(cvec.bufs.empty()); - - // create a context for each buffer type - std::map ctx_map; - auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - auto it = ctx_map.find(buft); - if (it == ctx_map.end()) { - struct ggml_init_params params = { - /*.mem_size =*/ model.hparams.n_layer*ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ggml_context * ctx = ggml_init(params); - if (!ctx) { - return nullptr; - } - ctx_map[buft] = ctx; - cvec.ctxs.emplace_back(ctx); - return ctx; - } - return it->second; - }; - - // make tensors - cvec.tensors.reserve(model.hparams.n_layer); - cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 - for (size_t il = 1; il < model.hparams.n_layer; il++) { - ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il); - ggml_context * ctx = ctx_for_buft(buft); - if (!ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); - return false; - } - ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd); - cvec.tensors.push_back(tensor); - } - - // allocate tensors / buffers and zero - cvec.bufs.reserve(ctx_map.size()); - for (auto it : ctx_map) { - ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); - return false; - } - ggml_backend_buffer_clear(buf, 0); - cvec.bufs.emplace_back(buf); - } - - return true; -} - -static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, const llama_model & model, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) { - if (data == nullptr) { - // disable the current control vector (but leave allocated for later) - cvec.layer_start = -1; - cvec.layer_end = -1; - return 0; - } - - if (n_embd != (int) model.hparams.n_embd) { - LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); - return 1; - } - - if (cvec.tensors.empty()) { - if (!llama_control_vector_init(cvec, model)) { - return 1; - } - } - - cvec.layer_start = il_start; - cvec.layer_end = il_end; - - for (size_t il = 1; il < model.hparams.n_layer; il++) { - assert(cvec.tensors[il] != nullptr); - - const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present - if (off + n_embd <= len) { - ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il])); - } - } - - return 0; -} +int32_t llama_control_vector_apply( + struct llama_control_vector & cvec, + const llama_model & model, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); // // llama_adapter_lora @@ -138,214 +44,25 @@ static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, co struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; + llama_lora_weight() = default; - llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} + llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {} }; // TODO: rename to llama_adapter_lora struct llama_lora_adapter { - struct llama_model * base_model; // map tensor name to lora_a_b std::unordered_map ab_map; + std::vector ctxs; std::vector bufs; float alpha; - llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { - base_model->lora_adapters.insert(this); - } - - llama_lora_weight * get_weight(struct ggml_tensor * w) { - std::string name(w->name); - auto pos = ab_map.find(name); - if (ab_map.find(name) != ab_map.end()) { - return &pos->second; - } - return nullptr; - } + llama_lora_adapter() = default; + ~llama_lora_adapter() = default; - ~llama_lora_adapter() { - auto pos = base_model->lora_adapters.find(this); - if (pos != base_model->lora_adapters.end()) { - base_model->lora_adapters.erase(pos); - } - } + llama_lora_weight * get_weight(struct ggml_tensor * w); }; -static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) { - auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), - [name](const std::pair & it) { - return it.first == name; - }); - if (it == model->tensors_by_name.end()) { - return nullptr; - } - return it->second; -} - -static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); - - ggml_context * ctx_init; - struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ true, - /* .ctx = */ &ctx_init, - }; - - gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; - if (!ctx_gguf) { - throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); - } - - ggml_context_ptr ctx { ctx_init }; - - // check metadata - { - auto get_kv_str = [&](const std::string & key) -> std::string { - int id = gguf_find_key(ctx_gguf.get(), key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); - }; - auto get_kv_f32 = [&](const std::string & key) -> float { - int id = gguf_find_key(ctx_gguf.get(), key.c_str()); - return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); - }; - LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - - auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); - if (general_type != "adapter") { - throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); - } - - auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); - auto general_arch = llm_arch_from_string(general_arch_str); - if (general_arch != model->arch) { - throw std::runtime_error("model arch and LoRA arch mismatch"); - } - - auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); - if (adapter_type != "lora") { - throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); - } - - adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); - } - - int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); - - // contexts for each buffer type - std::map ctx_map; - auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - auto it = ctx_map.find(buft); - if (it == ctx_map.end()) { - // add a new context - struct ggml_init_params params = { - /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ggml_context * buft_ctx = ggml_init(params); - if (!buft_ctx) { - return nullptr; - } - ctx_map[buft] = buft_ctx; - adapter.ctxs.emplace_back(buft_ctx); - return buft_ctx; - }; - return it->second; - }; - - // bundle lora_a and lora_b into pairs - std::map ab_map; - auto str_endswith = [](const std::string & str, const std::string & suffix) { - return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; - }; - for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) { - std::string name(cur->name); - if (str_endswith(name, ".lora_a")) { - replace_all(name, ".lora_a", ""); - if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = llama_lora_weight(cur, nullptr); - } else { - ab_map[name].a = cur; - } - } else if (str_endswith(name, ".lora_b")) { - replace_all(name, ".lora_b", ""); - if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = llama_lora_weight(nullptr, cur); - } else { - ab_map[name].b = cur; - } - } else { - throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); - } - } - - // add tensors - for (auto & it : ab_map) { - const std::string & name = it.first; - llama_lora_weight & w = it.second; - - if (!w.a || !w.b) { - throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); - } - - // device buft and device ctx - auto * model_tensor = llama_get_model_tensor(model, name.c_str()); - if (!model_tensor) { - throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); - } - struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); - // validate tensor shape - if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { - throw std::runtime_error("tensor '" + name + "' has incorrect shape"); - } - if (w.a->ne[1] != w.b->ne[0]) { - throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); - } - // save tensor to adapter - struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); - struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); - ggml_set_name(tensor_a, w.a->name); - ggml_set_name(tensor_b, w.b->name); - adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b); - } - - // allocate tensors / buffers and zero - { - adapter.ctxs.reserve(ctx_map.size()); - adapter.bufs.reserve(ctx_map.size()); - for (auto & it : ctx_map) { - ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx_dev = it.second; - ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) }; - if (!buf) { - throw std::runtime_error("failed to allocate buffer for lora adapter\n"); - } - LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0); - adapter.bufs.emplace_back(std::move(buf)); - } - } - - // set tensor data - { - llama_file gguf_file(path_lora, "rb"); - std::vector read_buf; - auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { - size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); - size_t size = ggml_nbytes(orig); - read_buf.resize(size); - gguf_file.seek(offs, SEEK_SET); - gguf_file.read_raw(read_buf.data(), size); - ggml_backend_tensor_set(dev, read_buf.data(), 0, size); - }; - for (auto & it : adapter.ab_map) { - auto orig = ab_map[it.first]; - auto dev = it.second; - set_tensor(orig.a, dev.a); - set_tensor(orig.b, dev.b); - } - } - - LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); -} +void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index cd0217b5c28f6d..ba9a59e396313c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2,6 +2,7 @@ #include "llama-impl.h" +#include #include const char * llm_type_name(llm_type type) { @@ -172,3 +173,15 @@ ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, in return ggml_add(ctx, cur, layer_dir); }); } + +struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name) { + auto it = std::find_if(model.tensors_by_name.begin(), model.tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model.tensors_by_name.end()) { + return nullptr; + } + + return it->second; +} diff --git a/src/llama-model.h b/src/llama-model.h index fa8509b64a906e..aa3ff9b0d8567e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -363,15 +363,6 @@ struct llama_model { // total size of all the tensors in the model in bytes size_t n_bytes = 0; - - // keep track of loaded lora adapters - std::set lora_adapters; - - ~llama_model() { - while (!lora_adapters.empty()) { - llama_lora_adapter_free(*lora_adapters.begin()); - } - } }; const char * llm_type_name(llm_type type); @@ -380,5 +371,9 @@ std::string llama_model_arch_name (const llama_model & model); std::string llama_model_type_name (const llama_model & model); std::string llama_model_ftype_name(const llama_model & model); -// TODO: this probably belongs to llama-adapter +// used by llama_adapter_vec ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il); + +// used by llama_adapter_lora +struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name); + diff --git a/src/llama.cpp b/src/llama.cpp index f049325bfce9cd..f1ed3bb0f62413 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15430,6 +15430,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { + struct llama_lora_adapter * adapter = new llama_lora_adapter(); + + try { + llama_lora_adapter_init_impl(*model, path_lora, *adapter); + return adapter; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); + + delete adapter; + } + + return nullptr; +} + int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, @@ -15438,7 +15453,9 @@ int32_t llama_lora_adapter_set( LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__); return -1; } + ctx->lora_adapters[adapter] = scale; + return 0; } @@ -15450,6 +15467,7 @@ int32_t llama_lora_adapter_remove( ctx->lora_adapters.erase(pos); return 0; } + return -1; } @@ -16319,17 +16337,6 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { - try { - struct llama_lora_adapter * adapter = new llama_lora_adapter(model); - llama_lora_adapter_init_internal(model, path_lora, *adapter); - return adapter; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return nullptr; - } -} - struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { struct llama_kv_cache_view result = { /*.n_cells = */ 0,