From ffc1e226e20ec599f0f79fdc2ef878bbd1a4f04a Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Fri, 8 Dec 2023 14:09:16 +0000 Subject: [PATCH] CA-386552 XSI-1534 Failed to disable pool HA after missing HA statefile Backport 5a639b159f3c307a4ac017ffd3306bf161ab5947 The issue arises because: * xapi_ha.ml, function disable_internal * When a static VDI is removed from an SR (manually by deleting it from an NFS server, for example) * As the SR is re-scanned, it is also removed from the xapi database * But the re-scan does not remove it from the list of static VDIs in /etc/xensource/static-vdis * When the function is called, it obtains the list of static VDIs (the UUIDs) * and looks them up in the database - which fails * static VDI references are further listed in Pool.ha_statefiles and can be stale. This patch hardens the code path to avoid failure when the statefile VDI has been removed manually. Signed-off-by: Christian Lindig --- ocaml/xapi/xapi_ha.ml | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml index eeeecf60363..d5a5f1a3b65 100644 --- a/ocaml/xapi/xapi_ha.ml +++ b/ocaml/xapi/xapi_ha.ml @@ -1158,6 +1158,7 @@ let emergency_ha_disable __context soft = ) let ha_release_resources __context localhost = + let __FUNCTION__ = "ha_release_resources" in Monitor.stop () ; (* Why aren't we calling Xha_statefile.detach_existing_statefiles? 
@@ -1168,14 +1169,17 @@ let ha_release_resources __context localhost = let statefile_vdis = Db.Pool.get_ha_statefiles ~__context ~self:(Helpers.get_pool ~__context) and deactivate_and_detach_vdi vdi_str = - let uuid = Db.VDI.get_uuid ~__context ~self:(Ref.of_string vdi_str) in - Helpers.log_exn_continue - (Printf.sprintf "detaching statefile VDI uuid: %s" uuid) - (fun () -> - Static_vdis.permanent_vdi_deactivate_by_uuid ~__context ~uuid ; - Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid - ) - () + match Db.VDI.get_uuid ~__context ~self:(Ref.of_string vdi_str) with + | uuid -> + Helpers.log_exn_continue + (Printf.sprintf "detaching statefile VDI uuid: %s" uuid) + (fun () -> + Static_vdis.permanent_vdi_deactivate_by_uuid ~__context ~uuid ; + Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid + ) + () + | exception _e -> + warn "%s: VDI %s not found in database" __FUNCTION__ vdi_str in List.iter deactivate_and_detach_vdi statefile_vdis ; (* Deactivate and detach any metadata VDIs *) @@ -1516,9 +1520,16 @@ let abort_new_master ~__context ~address = let disable_internal __context = debug "Disabling HA on the Pool" ; let pool = Helpers.get_pool ~__context in + (* Avoid stale static VDIs *) + Static_vdis.gc () ; (* Find the HA metadata and statefile VDIs for later *) let statefile_vdis = - List.map Ref.of_string (Db.Pool.get_ha_statefiles ~__context ~self:pool) + let is_valid ref = + if Db.is_valid_ref __context ref then Some ref else None + in + Db.Pool.get_ha_statefiles ~__context ~self:pool + |> List.map Ref.of_string + |> List.filter_map is_valid in let metadata_vdis = List.map