From f1d4138c1389d7befad88493fc77f41634c9a770 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 21 Feb 2024 13:08:57 +0200
Subject: [PATCH] server : fix initialization thread issues

---
 examples/server/server.cpp                    | 26 +++++++++----------
 examples/server/tests/features/server.feature |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c7821eca68cba..b77e00a4d5bc8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2719,19 +2719,6 @@ int main(int argc, char **argv)
         log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
     }
 
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
     // load the model
     if (!llama.load_model(params))
     {
@@ -3194,6 +3181,19 @@ int main(int argc, char **argv)
             }*/
     //);
 
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    state.store(SERVER_STATE_ERROR);
+                    return 1;
+                }
+
+                return 0;
+            });
+
     llama.queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
     llama.queue_tasks.on_finish_multitask(std::bind(
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 8bafbc39b7519..f06375c25f725 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -84,7 +84,7 @@ Feature: llama.cpp server
     Then all prompts are predicted
 
   # FIXME: #3969 infinite loop on the CI, not locally, if n_prompt * n_predict > kv_size
-  @bug
+  @llama.cpp
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size
     Given a prompt:
      """
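
Not part of the patch above, but for illustration: the change boils down to a startup-ordering rule, i.e. finish the blocking initialization (model load, route and task-queue setup) on the main thread before spawning the thread that accepts HTTP connections. Below is a minimal, self-contained C++ sketch of that ordering; `load_model()`, `serve_forever()` and `run_task_loop()` are hypothetical placeholders standing in for the corresponding parts of `examples/server/server.cpp`, not functions taken from it.

```cpp
// Minimal sketch (not the server's actual code) of the startup ordering the
// patch enforces: heavy initialization happens first, and the thread that
// accepts HTTP connections is started only afterwards, so no request can
// observe a half-initialized server. The helpers below are placeholders.
#include <atomic>
#include <cstdio>
#include <thread>

enum server_state { SERVER_STATE_LOADING_MODEL, SERVER_STATE_READY, SERVER_STATE_ERROR };

static std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};

static bool load_model()    { std::puts("loading model ...");       return true; } // placeholder
static bool serve_forever() { std::puts("HTTP server listening");   return true; } // placeholder
static void run_task_loop() { std::puts("processing tasks ...");                 } // placeholder

int main() {
    // 1) Block on initialization before anything can accept connections.
    if (!load_model()) {
        state.store(SERVER_STATE_ERROR);
        return 1;
    }
    state.store(SERVER_STATE_READY);

    // 2) Only now spawn the listener thread; any request it serves is
    //    guaranteed to see fully initialized state.
    std::thread t([&]() {
        if (!serve_forever()) {
            state.store(SERVER_STATE_ERROR);
        }
    });

    // 3) The main thread keeps running the task-processing loop.
    run_task_loop();

    t.join();
    return state.load() == SERVER_STATE_ERROR ? 1 : 0;
}
```

In the patch itself the same effect is achieved simply by moving the `LOG_INFO("HTTP server listening", ...)` call and the `std::thread t(...)` block from before `llama.load_model(params)` to after the route setup, immediately before the task-queue callbacks are registered.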