From f1d4138c1389d7befad88493fc77f41634c9a770 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 21 Feb 2024 13:08:57 +0200
Subject: [PATCH] server : fix initialization thread issues

---
 examples/server/server.cpp                    | 26 +++++++++----------
 examples/server/tests/features/server.feature |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c7821eca68cba..b77e00a4d5bc8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2719,19 +2719,6 @@ int main(int argc, char **argv)
         log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
     }
 
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
     // load the model
     if (!llama.load_model(params))
     {
@@ -3194,6 +3181,19 @@ int main(int argc, char **argv)
             }*/
     //);
 
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    state.store(SERVER_STATE_ERROR);
+                    return 1;
+                }
+
+                return 0;
+            });
+
     llama.queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
     llama.queue_tasks.on_finish_multitask(std::bind(
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 8bafbc39b7519..f06375c25f725 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -84,7 +84,7 @@ Feature: llama.cpp server
     Then all prompts are predicted
 
   # FIXME: #3969 infinite loop on the CI, not locally, if n_prompt * n_predict > kv_size
-  @bug
+  @llama.cpp
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size
     Given a prompt:
      """
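
Not part of the patch above, but for illustration: the change boils down to a startup-ordering rule, i.e. finish the blocking initialization (model load, route and task-queue setup) on the main thread before spawning the thread that accepts HTTP connections. Below is a minimal, self-contained C++ sketch of that ordering; `load_model()`, `serve_forever()` and `run_task_loop()` are hypothetical placeholders standing in for the corresponding parts of `examples/server/server.cpp`, not functions taken from it.

```cpp
// Minimal sketch (not the server's actual code) of the startup ordering the
// patch enforces: heavy initialization happens first, and the thread that
// accepts HTTP connections is started only afterwards, so no request can
// observe a half-initialized server. The helpers below are placeholders.
#include <atomic>
#include <cstdio>
#include <thread>

enum server_state { SERVER_STATE_LOADING_MODEL, SERVER_STATE_READY, SERVER_STATE_ERROR };

static std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};

static bool load_model()    { std::puts("loading model ...");       return true; } // placeholder
static bool serve_forever() { std::puts("HTTP server listening");   return true; } // placeholder
static void run_task_loop() { std::puts("processing tasks ...");                 } // placeholder

int main() {
    // 1) Block on initialization before anything can accept connections.
    if (!load_model()) {
        state.store(SERVER_STATE_ERROR);
        return 1;
    }
    state.store(SERVER_STATE_READY);

    // 2) Only now spawn the listener thread; any request it serves is
    //    guaranteed to see fully initialized state.
    std::thread t([&]() {
        if (!serve_forever()) {
            state.store(SERVER_STATE_ERROR);
        }
    });

    // 3) The main thread keeps running the task-processing loop.
    run_task_loop();

    t.join();
    return state.load() == SERVER_STATE_ERROR ? 1 : 0;
}
```

In the patch itself the same effect is achieved simply by moving the `LOG_INFO("HTTP server listening", ...)` call and the `std::thread t(...)` block from before `llama.load_model(params)` to after the route setup, immediately before the task-queue callbacks are registered.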