From 0e8cc2074f1fa0f39bc289cd0a3f54e0532121b8 Mon Sep 17 00:00:00 2001
From: Ming Lu
Date: Fri, 27 Dec 2024 14:24:51 +0800
Subject: [PATCH 1/2] CA-404062: Wrongly restart xapi when receiving HTTP errors

The xapi on a supporter host would restart when it received an HTTP
error from the xapi on the coordinator host. This breaks the
pool.designate_new_master use case for a large pool, e.g. a 64-host
pool: some supporters may restart unexpectedly during the phase that
commits the new coordinator because of this logic.

Additionally, the rationale given in the error message is not correct:
not all HTTP errors mean "our master address is wrong". If a use case
genuinely requires restarting xapi, more explicit logic should trigger
the restart instead of relying on an implicit HTTP error code.
Furthermore, if a supporter really is connected to the wrong
coordinator, that is a bug and can be recovered from manually.

For these reasons, the restart of xapi on receiving an HTTP error is
removed. This also resolves the TODO noted in CA-36936.

Signed-off-by: Ming Lu
---
 ocaml/database/master_connection.ml | 115 ++++++++++++++--------------
 1 file changed, 57 insertions(+), 58 deletions(-)

diff --git a/ocaml/database/master_connection.ml b/ocaml/database/master_connection.ml
index e10658d48c0..89247488820 100644
--- a/ocaml/database/master_connection.ml
+++ b/ocaml/database/master_connection.ml
@@ -204,6 +204,7 @@ let connection_timeout = ref !Db_globs.master_connection_default_timeout
    are exceeded *)
 let restart_on_connection_timeout = ref true
 
+
 exception Content_length_required
 
 let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
@@ -221,6 +222,59 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
     else if !backoff_delay > 256.0 then
       backoff_delay := 256.0
   in
+  let reconnect () =
+    (* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
+    ( match !my_connection with
+    | None ->
+        ()
+    | Some st_proc -> (
+        my_connection := None ;
+        (* don't want to try closing multiple times *)
+        try Stunnel.disconnect st_proc with _ -> ()
+      )
+    ) ;
+    let time_sofar = Unix.gettimeofday () -. time_call_started in
+    if !connection_timeout < 0. then (
+      if not !surpress_no_timeout_logs then (
+        debug
+          "Connection to master died. I will continue to retry \
+           indefinitely (supressing future logging of this message)." ;
+        error
+          "Connection to master died. I will continue to retry \
+           indefinitely (supressing future logging of this message)."
+      ) ;
+      surpress_no_timeout_logs := true
+    ) else
+      debug
+        "Connection to master died: time taken so far in this call '%f'; \
+         will %s"
+        time_sofar
+        ( if !connection_timeout < 0. then
+            "never timeout"
+          else
+            Printf.sprintf "timeout after '%f'" !connection_timeout
+        ) ;
+    if time_sofar > !connection_timeout && !connection_timeout >= 0. then
+      if !restart_on_connection_timeout then (
+        debug
+          "Exceeded timeout for retrying master connection: restarting xapi" ;
+        !Db_globs.restart_fn ()
+      ) else (
+        debug
+          "Exceeded timeout for retrying master connection: raising \
+           Cannot_connect_to_master" ;
+        raise Cannot_connect_to_master
+      ) ;
+    debug "Sleeping %f seconds before retrying master connection..."
+      !backoff_delay ;
+    let timed_out = Scheduler.PipeDelay.wait delay !backoff_delay in
+    if not timed_out then
+      debug "%s: Sleep interrupted, retrying master connection now"
+        __FUNCTION__ ;
+    update_backoff_delay () ;
+    D.log_and_ignore_exn open_secure_connection
+  in
+
   while not !write_ok do
     try
       let req_string = req in
@@ -266,67 +320,12 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
           Db_globs.http_limit_max_rpc_size ;
         debug "Re-raising exception to caller." ;
         raise Http.Client_requested_size_over_limit
-    (* TODO: This http exception handler caused CA-36936 and can probably be removed now that there's backoff delay in the generic handler _ below *)
     | Http_client.Http_error (http_code, err_msg) ->
-        error
-          "Received HTTP error %s (%s) from master. This suggests our master \
-           address is wrong. Sleeping for %.0fs and then executing restart_fn."
-          http_code err_msg
-          !Db_globs.permanent_master_failure_retry_interval ;
-        Thread.delay !Db_globs.permanent_master_failure_retry_interval ;
-        !Db_globs.restart_fn ()
+        error "Received HTTP error %s (%s) from the coordinator" http_code err_msg ;
+        reconnect ()
     | e ->
         error "Caught %s" (Printexc.to_string e) ;
-        (* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
-        ( match !my_connection with
-        | None ->
-            ()
-        | Some st_proc -> (
-            my_connection := None ;
-            (* don't want to try closing multiple times *)
-            try Stunnel.disconnect st_proc with _ -> ()
-          )
-        ) ;
-        let time_sofar = Unix.gettimeofday () -. time_call_started in
-        if !connection_timeout < 0. then (
-          if not !surpress_no_timeout_logs then (
-            debug
-              "Connection to master died. I will continue to retry \
-               indefinitely (supressing future logging of this message)." ;
-            error
-              "Connection to master died. I will continue to retry \
-               indefinitely (supressing future logging of this message)."
-          ) ;
-          surpress_no_timeout_logs := true
-        ) else
-          debug
-            "Connection to master died: time taken so far in this call '%f'; \
-             will %s"
-            time_sofar
-            ( if !connection_timeout < 0. then
-                "never timeout"
-              else
-                Printf.sprintf "timeout after '%f'" !connection_timeout
-            ) ;
-        if time_sofar > !connection_timeout && !connection_timeout >= 0. then
-          if !restart_on_connection_timeout then (
-            debug
-              "Exceeded timeout for retrying master connection: restarting xapi" ;
-            !Db_globs.restart_fn ()
-          ) else (
-            debug
-              "Exceeded timeout for retrying master connection: raising \
-               Cannot_connect_to_master" ;
-            raise Cannot_connect_to_master
-          ) ;
-        debug "Sleeping %f seconds before retrying master connection..."
-          !backoff_delay ;
-        let timed_out = Scheduler.PipeDelay.wait delay !backoff_delay in
-        if not timed_out then
-          debug "%s: Sleep interrupted, retrying master connection now"
-            __FUNCTION__ ;
-        update_backoff_delay () ;
-        D.log_and_ignore_exn open_secure_connection
+        reconnect ()
   done ;
   !result
 

From 9a44916385b7e787d553210f5b90e4d684b860fc Mon Sep 17 00:00:00 2001
From: Ming Lu
Date: Thu, 2 Jan 2025 05:32:59 +0000
Subject: [PATCH 2/2] CA-404062: Reformat

Signed-off-by: Ming Lu
---
 ocaml/database/master_connection.ml | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/ocaml/database/master_connection.ml b/ocaml/database/master_connection.ml
index 89247488820..ed9bfbd2826 100644
--- a/ocaml/database/master_connection.ml
+++ b/ocaml/database/master_connection.ml
@@ -204,7 +204,6 @@ let connection_timeout = ref !Db_globs.master_connection_default_timeout
    are exceeded *)
 let restart_on_connection_timeout = ref true
 
-
 exception Content_length_required
 
 let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
@@ -237,17 +236,17 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
     if !connection_timeout < 0. then (
       if not !surpress_no_timeout_logs then (
         debug
-          "Connection to master died. I will continue to retry \
-           indefinitely (supressing future logging of this message)." ;
+          "Connection to master died. I will continue to retry indefinitely \
+           (supressing future logging of this message)." ;
         error
-          "Connection to master died. I will continue to retry \
-           indefinitely (supressing future logging of this message)."
+          "Connection to master died. I will continue to retry indefinitely \
+           (supressing future logging of this message)."
       ) ;
       surpress_no_timeout_logs := true
     ) else
       debug
-        "Connection to master died: time taken so far in this call '%f'; \
-         will %s"
+        "Connection to master died: time taken so far in this call '%f'; will \
+         %s"
         time_sofar
         ( if !connection_timeout < 0. then
             "never timeout"
@@ -256,8 +255,7 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
         ) ;
     if time_sofar > !connection_timeout && !connection_timeout >= 0. then
       if !restart_on_connection_timeout then (
-        debug
-          "Exceeded timeout for retrying master connection: restarting xapi" ;
+        debug "Exceeded timeout for retrying master connection: restarting xapi" ;
         !Db_globs.restart_fn ()
       ) else (
         debug
@@ -269,8 +267,7 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
       !backoff_delay ;
     let timed_out = Scheduler.PipeDelay.wait delay !backoff_delay in
    if not timed_out then
-      debug "%s: Sleep interrupted, retrying master connection now"
-        __FUNCTION__ ;
+      debug "%s: Sleep interrupted, retrying master connection now" __FUNCTION__ ;
     update_backoff_delay () ;
     D.log_and_ignore_exn open_secure_connection
   in
@@ -321,7 +318,8 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
         debug "Re-raising exception to caller." ;
         raise Http.Client_requested_size_over_limit
     | Http_client.Http_error (http_code, err_msg) ->
-        error "Received HTTP error %s (%s) from the coordinator" http_code err_msg ;
+        error "Received HTTP error %s (%s) from the coordinator" http_code
+          err_msg ;
         reconnect ()
     | e ->
         error "Caught %s" (Printexc.to_string e) ;
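
The change in both patches boils down to one pattern: every failure in the RPC
loop, HTTP errors included, now goes through the same reconnect-with-backoff
path instead of restarting the daemon. The standalone OCaml sketch below only
illustrates that pattern; retry_rpc, do_rpc and reopen_connection are
hypothetical placeholders invented for the example and are not xapi's API (the
real logic is do_db_xml_rpc_persistent_with_reopen and its reconnect helper in
the diff above). The 2s-to-256s backoff mirrors the patch.

(* Standalone sketch; build against the unix library. *)
exception Cannot_connect_to_master

let retry_rpc ?(timeout = 300.) ~do_rpc ~reopen_connection req =
  let started = Unix.gettimeofday () in
  (* initial delay = 2s, doubled on each failure up to 256s, as in the patch *)
  let backoff = ref 2.0 in
  let result = ref None in
  while Option.is_none !result do
    try result := Some (do_rpc req)
    with e ->
      (* Any failure, HTTP error or otherwise, is logged and retried. *)
      Printf.eprintf "RPC to coordinator failed: %s\n%!" (Printexc.to_string e) ;
      if Unix.gettimeofday () -. started > timeout then
        raise Cannot_connect_to_master ;
      Unix.sleepf !backoff ;
      backoff := Float.min (!backoff *. 2.0) 256.0 ;
      (* Drop and reopen the connection; never restart the process. *)
      (try reopen_connection () with _ -> ())
  done ;
  match !result with Some r -> r | None -> assert false

In the real code the timeout branch can still restart xapi when
restart_on_connection_timeout is set; the sketch keeps only the raise path for
brevity, which is the behaviour the first commit message argues for on HTTP
errors.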