Skip to content

Commit

Permalink
CP-43901: Block pool member startup if it has a higher xapi version
Browse files Browse the repository at this point in the history
During a pool upgrade, the pool coordinator is expected to be updated
first, and the new xapi should start on the coordinator before it
starts on any other host in the pool. Otherwise it is dangerous and may
cause unpredictable problems.

This commit blocks pool member startup if the member has a higher xapi
version than the coordinator, until the coordinator is running an equal
or higher version of xapi.

Signed-off-by: Gang Ji <[email protected]>
  • Loading branch information
gangj authored Nov 2, 2023
1 parent 412ac00 commit 0ce3925
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 26 deletions.
3 changes: 3 additions & 0 deletions ocaml/idl/datamodel_errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,9 @@ let _ =
the coordinator's database and pointing to the correct coordinator? Are \
all servers using the same pool secret?"
() ;
error Api_errors.host_xapi_version_higher_than_coordinator
["host_xapi_version"]
~doc:"The host xapi version is higher than the one in the coordinator" () ;
error Api_errors.host_broken []
~doc:
"This server failed in the middle of an automatic failover operation and \
Expand Down
1 change: 1 addition & 0 deletions ocaml/idl/datamodel_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ let local_assert_healthy =
; Api_errors.license_restriction
; Api_errors.license_does_not_support_pooling
; Api_errors.ha_should_be_fenced
; Api_errors.host_xapi_version_higher_than_coordinator
]
~allowed_roles:_R_LOCAL_ROOT_ONLY ()

Expand Down
31 changes: 17 additions & 14 deletions ocaml/util/xapi_version.ml
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,31 @@ let hostname = "localhost"

let date = Xapi_build_info.date

(** [parse_xapi_version version] splits a version string of the form
    ["<major>.<minor>.<rest>"] into [(major, minor, rest)], where [major] and
    [minor] are integers and [rest] is whatever follows the second dot, up to
    the first whitespace.
    @raise Failure if [version] does not have that shape. *)
let parse_xapi_version version =
  try Scanf.sscanf version "%d.%d.%s" (fun maj min rest -> (maj, min, rest))
  with
  (* Only the exceptions sscanf is documented to raise are translated; any
     other exception (e.g. Out_of_memory) propagates unchanged instead of
     being reported as a version parse error. *)
  | Scanf.Scan_failure _ | Failure _ | End_of_file ->
    failwith
      (Printf.sprintf "Couldn't determine xapi version from string: '%s'"
         version
      )

let version, xapi_version_major, xapi_version_minor, git_id =
match Build_info.V1.version () with
| None ->
("0.0.dev", 0, 0, "dev")
| Some v -> (
| Some v ->
let str = Build_info.V1.Version.to_string v in
let version =
if String.starts_with ~prefix:"v" str then
String.sub str 1 (String.length str - 1)
else
str
in
try
let maj, min, git_id =
Scanf.sscanf version "%d.%d.%s" (fun maj min rest -> (maj, min, rest))
in
(version, maj, min, git_id)
with _ ->
failwith
(Printf.sprintf
"Couldn't determine xapi version - got unexpected version from \
dune: '%s'"
version
)
)
let maj, min, git_id = parse_xapi_version version in
(version, maj, min, git_id)

(** [compare_version version_a version_b] orders two xapi version strings by
    their major number first and their minor number second; whatever follows
    the second dot is not taken into account. The result is negative, zero or
    positive, as with [Int.compare]. Fails (via [parse_xapi_version]) when
    either string cannot be parsed. *)
let compare_version version_a version_b =
  let major_a, minor_a, _ = parse_xapi_version version_a in
  let major_b, minor_b, _ = parse_xapi_version version_b in
  match Int.compare major_a major_b with
  | 0 ->
      (* Same major version: the minor version decides. *)
      Int.compare minor_a minor_b
  | by_major ->
      by_major
2 changes: 2 additions & 0 deletions ocaml/util/xapi_version.mli
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@ val git_id : string
val xapi_version_major : int

val xapi_version_minor : int

(** [compare_version version_a version_b] compares two xapi version strings
    by their major and then minor components; the part after the second dot
    is ignored. Returns a negative integer, zero, or a positive integer when
    [version_a] is respectively lower than, equal to, or higher than
    [version_b].
    @raise Failure if either string is not of the form
    ["<major>.<minor>.<rest>"]. *)
val compare_version : string -> string -> int
3 changes: 3 additions & 0 deletions ocaml/xapi-consts/api_errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ let host_master_cannot_talk_back = "HOST_MASTER_CANNOT_TALK_BACK"

let host_unknown_to_master = "HOST_UNKNOWN_TO_MASTER"

(* the host xapi version is higher than the one in the coordinator *)
let host_xapi_version_higher_than_coordinator =
"HOST_XAPI_VERSION_HIGHER_THAN_COORDINATOR"

(* should be fenced *)
let host_broken = "HOST_BROKEN"

Expand Down
3 changes: 3 additions & 0 deletions ocaml/xapi-consts/api_messages.ml
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,6 @@ let tls_verification_emergency_disabled =
addMessage "TLS_VERIFICATION_EMERGENCY_DISABLED" 3L

let periodic_update_sync_failed = addMessage "PERIODIC_UPDATE_SYNC_FAILED" 3L

(* Message created when xapi startup on a pool member is blocked because the
   member's xapi version is higher than the coordinator's *)
let xapi_startup_blocked_as_version_higher_than_coordinator =
addMessage "XAPI_STARTUP_BLOCKED_AS_VERSION_HIGHER_THAN_COORDINATOR" 2L
75 changes: 63 additions & 12 deletions ocaml/xapi/xapi.ml
Original file line number Diff line number Diff line change
Expand Up @@ -417,14 +417,19 @@ let wait_for_management_ip_address ~__context =
) ;
ip

type hello_error =
type host_status_check_error =
| Permanent (* e.g. the pool secret is wrong i.e. wrong master *)
| Temporary

(* some glitch or other *)

(** Attempt a Pool.hello, return None if ok or Some hello_error otherwise *)
let attempt_pool_hello my_ip =
(* Set to true once the "startup blocked: xapi version higher than
   coordinator" message has been created, so that later retries of the status
   check do not create the same message again *)
let xapi_ver_high_alerted = ref false

(** Attempt checking host status with pool coordinator:
* 1. Pool.hello
* 2. if Pool.hello ok, check xapi version
* Return None if ok or Some host_status_check_error otherwise *)
let attempt_host_status_check_with_coordinator ~__context my_ip =
let localhost_uuid = Helpers.get_localhost_uuid () in
try
Helpers.call_emergency_mode_functions (Pool_role.get_master_address ())
Expand All @@ -444,7 +449,46 @@ let attempt_pool_hello my_ip =
[localhost_uuid] ;
Some Permanent
| `ok ->
None
let xapi_version_higher version =
version |> Xapi_version.compare_version Xapi_version.version
|> fun r -> r > 0
in
if
xapi_version_higher
(Db.Host.get_software_version ~__context
~self:(Helpers.get_master ~__context)
|> List.assoc "xapi_build"
)
then (
let name_label =
Db.Host.get_name_label ~__context
~self:(Helpers.get_localhost ~__context)
in
let err_msg =
Printf.sprintf
"Xapi startup in pool member %s is blocked as its xapi \
version (%s) is higher than xapi version in pool \
coordinator."
name_label Xapi_version.version
in
if not !xapi_ver_high_alerted then (
let name, priority =
Api_messages
.xapi_startup_blocked_as_version_higher_than_coordinator
in
ignore
(Client.Client.Message.create ~rpc ~session_id ~name ~priority
~cls:`Host ~obj_uuid:localhost_uuid ~body:err_msg
) ;
xapi_ver_high_alerted := true
) ;
error "%s" err_msg ;
Xapi_host.set_emergency_mode_error
Api_errors.host_xapi_version_higher_than_coordinator
[Xapi_version.version] ;
Some Permanent
) else
None
)
with
| Api_errors.Server_error (code, _)
Expand All @@ -456,13 +500,15 @@ let attempt_pool_hello my_ip =
[localhost_uuid] ;
Some Permanent
| Api_errors.Server_error (code, params) as exn ->
debug "Caught exception: %s during Pool.hello"
(ExnHelper.string_of_exn exn) ;
debug "Caught exception: %s in %s"
(ExnHelper.string_of_exn exn)
__FUNCTION__ ;
Xapi_host.set_emergency_mode_error code params ;
Some Temporary
| exn ->
debug "Caught exception: %s during Pool.hello"
(ExnHelper.string_of_exn exn) ;
debug "Caught exception: %s in %s"
(ExnHelper.string_of_exn exn)
__FUNCTION__ ;
Xapi_host.set_emergency_mode_error Api_errors.internal_error
[ExnHelper.string_of_exn exn] ;
Some Temporary
Expand Down Expand Up @@ -1129,23 +1175,28 @@ let server_init () =
Helpers.touch_file !Xapi_globs.ready_file ;
(* Keep trying to log into master *)
let finished = ref false in

while not !finished do
(* Grab the management IP address (wait forever for it if necessary) *)
let ip = wait_for_management_ip_address ~__context in
debug "Start master_connection watchdog" ;
ignore (Master_connection.start_master_connection_watchdog ()) ;
debug "Attempting to communicate with master" ;
(* Try to say hello to the pool *)
match attempt_pool_hello ip with

(* Try to check host status with the pool *)
match
attempt_host_status_check_with_coordinator ~__context ip
with
| None ->
finished := true
| Some Temporary ->
debug "I think the error is a temporary one, retrying in 5s" ;
Thread.delay 5.
| Some Permanent ->
error
"Permanent error in Pool.hello, will retry after %.0fs \
just in case"
"Permanent error in \
attempt_host_status_check_with_coordinator, will retry \
after %.0fs just in case"
!Db_globs.permanent_master_failure_retry_interval ;
Thread.delay !Db_globs.permanent_master_failure_retry_interval
done ;
Expand Down

0 comments on commit 0ce3925

Please sign in to comment.