diff --git a/common/common.h b/common/common.h index fee4c264ea9de..856c494cb1b35 100644 --- a/common/common.h +++ b/common/common.h @@ -638,6 +638,10 @@ common_control_vector_data common_control_vector_load(const std::vector -#include #include #include #include #include - -#include -#include #include + +#include +#include #include #if defined(_WIN32) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 912caf346e75e..ab91d0b40aa03 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,7 +1,7 @@ -#include "common.h" #include "ggml.h" #include "llama.h" -#include "llama-impl.h" +#include "llama-context.h" +#include "common.h" #include #include @@ -9,11 +9,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -330,13 +328,13 @@ int main(int argc, char ** argv) { } } - const auto &tensors = llama_internal_get_tensor_map(ctx); + const auto & tensors = llama_internal_get_tensor_map(ctx); // check layer tensors int included_layers = 0; int64_t max_nelements = 0; bool is_f16 = false; - for (const auto& kv_tensor : tensors) { + for (const auto & kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } @@ -371,8 +369,8 @@ int main(int argc, char ** argv) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } - const auto * qfns = ggml_get_type_traits(type); - const auto * qfns_cpu = ggml_get_type_traits_cpu(type); + const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); if (qfns_cpu->from_float && qfns->to_float) { if (params.verbose) { printf("testing %s ...\n", ggml_type_name(type)); @@ -382,7 +380,7 @@ int main(int argc, char ** argv) { error_stats global_stats {}; - for (const auto& kv_tensor : tensors) { + for (const auto & kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8dd4fcf777c23..6bb1a75dc4c33 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(llama llama-chat.cpp llama-context.cpp llama-hparams.cpp + llama-impl.cpp llama-grammar.cpp llama-kv-cache.cpp llama-mmap.cpp diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 4394ade1ae996..31dac843dadef 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -5,6 +5,7 @@ #include #include #include +#include // vec diff --git a/src/llama-batch.h b/src/llama-batch.h index 68e22a0caa18e..4b53748810be2 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -26,7 +26,9 @@ struct llama_ubatch { struct llama_sbatch_seq { int32_t n_seq_id; + llama_seq_id * seq_id; + size_t offset; size_t length; }; @@ -112,8 +114,8 @@ struct llama_sbatch { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { memcpy( - ubatch.embd + n_embd * (ubatch.n_tokens + i), - batch->embd + n_embd * ids[seq.offset + i], + ubatch.embd + (n_embd * (ubatch.n_tokens + i)), + batch->embd + (n_embd * ids[seq.offset + i]), n_embd * sizeof(float) ); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9c7497379c457..69de29b6d0398 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1,5 +1,7 @@ #include "llama-context.h" +#include + // deprecated size_t llama_get_state_size(struct llama_context * ctx) { return llama_state_get_size(ctx); @@ -968,3 +970,8 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa } } +const std::vector> & llama_internal_get_tensor_map( + struct llama_context * ctx +) { + return ctx->model.tensors_by_name; +} diff --git a/src/llama-context.h b/src/llama-context.h index bfa54002e222d..94d94a98b0958 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -219,3 +219,7 @@ static void llama_output_reorder(struct llama_context * ctx) { out_ids.clear(); } } + +// For internal test use +// TODO: remove +const std::vector> & llama_internal_get_tensor_map(struct llama_context * ctx); diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index 76d0cb3a2ff78..186dc9a25cf96 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -1,5 +1,6 @@ #include "llama-grammar.h" +#include "llama-impl.h" #include "llama-vocab.h" #include "llama-sampling.h" diff --git a/src/llama-grammar.h b/src/llama-grammar.h index 13e940fb52e24..f8b40c6515744 100644 --- a/src/llama-grammar.h +++ b/src/llama-grammar.h @@ -1,8 +1,10 @@ #pragma once -#include "llama-impl.h" +#include "llama.h" #include +#include +#include struct llama_vocab; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp new file mode 100644 index 0000000000000..fc3fee213f8d6 --- /dev/null +++ b/src/llama-impl.cpp @@ -0,0 +1,74 @@ +#include "llama-impl.h" + +#include "llama.h" + +#include + +struct llama_logger_state { + ggml_log_callback log_callback = llama_log_callback_default; + void * log_callback_user_data = nullptr; +}; + +static llama_logger_state g_logger_state; + +time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} + +time_meas::~time_meas() { + if (t_start_us >= 0) { + t_acc += ggml_time_us() - t_start_us; + } + } + +void replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; + } + std::string builder; + builder.reserve(s.length()); + size_t pos = 0; + size_t last_pos = 0; + while ((pos = s.find(search, last_pos)) != std::string::npos) { + builder.append(s, last_pos, pos - last_pos); + builder.append(replace); + last_pos = pos + search.length(); + } + builder.append(s, last_pos, std::string::npos); + s = std::move(builder); +} + +void llama_log_set(ggml_log_callback log_callback, void * user_data) { + ggml_log_set(log_callback, user_data); + g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default; + g_logger_state.log_callback_user_data = user_data; +} + +static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) { + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); + } else { + char * buffer2 = new char[len + 1]; + vsnprintf(buffer2, len + 1, format, args_copy); + buffer2[len] = 0; + g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); + delete[] buffer2; + } + va_end(args_copy); +} + +void llama_log_internal(ggml_log_level level, const char * format, ...) { + va_list args; + va_start(args, format); + llama_log_internal_v(level, format, args); + va_end(args); +} + +void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} diff --git a/src/llama-impl.h b/src/llama-impl.h index b9b979ebb74c1..dbe5c21c56742 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -1,10 +1,8 @@ #pragma once -#include "llama.h" +#include "ggml.h" #include -#include -#include #ifdef __GNUC__ #ifdef __MINGW32__ @@ -40,146 +38,12 @@ std::string format(const char * fmt, ...); // struct time_meas { - time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} - - ~time_meas() { - if (t_start_us >= 0) { - t_acc += ggml_time_us() - t_start_us; - } - } + time_meas(int64_t & t_acc, bool disable = false); + ~time_meas(); const int64_t t_start_us; int64_t & t_acc; }; -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - if (search.empty()) { - return; - } - std::string builder; - builder.reserve(s.length()); - size_t pos = 0; - size_t last_pos = 0; - while ((pos = s.find(search, last_pos)) != std::string::npos) { - builder.append(s, last_pos, pos - last_pos); - builder.append(replace); - last_pos = pos + search.length(); - } - builder.append(s, last_pos, std::string::npos); - s = std::move(builder); -} - -const std::vector> & llama_internal_get_tensor_map( - struct llama_context * ctx -); - -// the ring buffer works similarly to std::deque, but with a fixed capacity -template -struct ring_buffer { - ring_buffer(size_t cap) : capacity(cap), data(cap) {} - - T & front() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[first]; - } - - const T & front() const { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[first]; - } - - T & back() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[pos]; - } - - const T & back() const { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[pos]; - } - - void push_back(const T & value) { - if (capacity == 0) { - throw std::runtime_error("ring buffer: capacity is zero"); - } - - if (sz == capacity) { - // advance the start when buffer is full - first = (first + 1) % capacity; - } else { - sz++; - } - data[pos] = value; - pos = (pos + 1) % capacity; - } - - T pop_front() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - T value = data[first]; - first = (first + 1) % capacity; - sz--; - return value; - } - - //T & operator[](size_t i) { - // if (i >= sz) { - // throw std::runtime_error("ring buffer: index out of bounds"); - // } - // return data[(first + i) % capacity]; - //} - - //const T & at(size_t i) const { - // if (i >= sz) { - // throw std::runtime_error("ring buffer: index out of bounds"); - // } - // return data[(first + i) % capacity]; - //} - - const T & rat(size_t i) const { - if (i >= sz) { - throw std::runtime_error("ring buffer: index out of bounds"); - } - return data[(first + sz - i - 1) % capacity]; - } - - std::vector to_vector() const { - std::vector result; - result.reserve(sz); - for (size_t i = 0; i < sz; i++) { - result.push_back(data[(first + i) % capacity]); - } - return result; - } - - void clear() { - // here only reset the status of the buffer - sz = 0; - first = 0; - pos = 0; - } - - bool empty() const { - return sz == 0; - } - - size_t size() const { - return sz; - } - - size_t capacity = 0; - size_t sz = 0; - size_t first = 0; - size_t pos = 0; - std::vector data; -}; +void replace_all(std::string & s, const std::string & search, const std::string & replace); diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index ce4b2da501a80..a9932633512a6 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -6,6 +6,7 @@ #include #include +#include #ifdef __has_include #if __has_include() diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 70e6306336a3f..f4a72bebc7978 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4,6 +4,7 @@ #include #include +#include const char * llm_type_name(llm_type type) { switch (type) { diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index bebff77cfa09d..86a76f71d519a 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1,5 +1,6 @@ #include "llama-sampling.h" +#include "llama-impl.h" #include "llama-vocab.h" #include "llama-grammar.h" @@ -14,6 +15,117 @@ #include #include #include +#include + +// the ring buffer works similarly to std::deque, but with a fixed capacity +template +struct ring_buffer { + ring_buffer(size_t cap) : capacity(cap), data(cap) {} + + T & front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + const T & front() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + T & back() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[pos]; + } + + const T & back() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[pos]; + } + + void push_back(const T & value) { + if (capacity == 0) { + throw std::runtime_error("ring buffer: capacity is zero"); + } + + if (sz == capacity) { + // advance the start when buffer is full + first = (first + 1) % capacity; + } else { + sz++; + } + data[pos] = value; + pos = (pos + 1) % capacity; + } + + T pop_front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + T value = data[first]; + first = (first + 1) % capacity; + sz--; + return value; + } + + //T & operator[](size_t i) { + // if (i >= sz) { + // throw std::runtime_error("ring buffer: index out of bounds"); + // } + // return data[(first + i) % capacity]; + //} + + //const T & at(size_t i) const { + // if (i >= sz) { + // throw std::runtime_error("ring buffer: index out of bounds"); + // } + // return data[(first + i) % capacity]; + //} + + const T & rat(size_t i) const { + if (i >= sz) { + throw std::runtime_error("ring buffer: index out of bounds"); + } + return data[(first + sz - i - 1) % capacity]; + } + + std::vector to_vector() const { + std::vector result; + result.reserve(sz); + for (size_t i = 0; i < sz; i++) { + result.push_back(data[(first + i) % capacity]); + } + return result; + } + + void clear() { + // here only reset the status of the buffer + sz = 0; + first = 0; + pos = 0; + } + + bool empty() const { + return sz == 0; + } + + size_t size() const { + return sz; + } + + size_t capacity = 0; + size_t sz = 0; + size_t first = 0; + size_t pos = 0; + std::vector data; +}; static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { // iterator for the probabilities diff --git a/src/llama.cpp b/src/llama.cpp index a8910b5e640ec..558e625c537f0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -202,13 +202,6 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_ // globals // -struct llama_logger_state { - ggml_log_callback log_callback = llama_log_callback_default; - void * log_callback_user_data = nullptr; -}; - -static llama_logger_state g_logger_state; - static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; @@ -17188,46 +17181,3 @@ void llama_perf_context_reset(struct llama_context * ctx) { ctx->t_p_eval_us = ctx->n_p_eval = 0; } -// For internal test use -const std::vector> & llama_internal_get_tensor_map( - struct llama_context * ctx -) { - return ctx->model.tensors_by_name; -} - -void llama_log_set(ggml_log_callback log_callback, void * user_data) { - ggml_log_set(log_callback, user_data); - g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default; - g_logger_state.log_callback_user_data = user_data; -} - -static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) { - va_list args_copy; - va_copy(args_copy, args); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); - } else { - char * buffer2 = new char[len + 1]; - vsnprintf(buffer2, len + 1, format, args_copy); - buffer2[len] = 0; - g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); - delete[] buffer2; - } - va_end(args_copy); -} - -void llama_log_internal(ggml_log_level level, const char * format, ...) { - va_list args; - va_start(args, format); - llama_log_internal_v(level, format, args); - va_end(args); -} - -void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - fputs(text, stderr); - fflush(stderr); -}