diff --git a/eden/fs/service/EdenServer.cpp b/eden/fs/service/EdenServer.cpp index f3b0601263a34..0404370aeea92 100644 --- a/eden/fs/service/EdenServer.cpp +++ b/eden/fs/service/EdenServer.cpp @@ -2027,6 +2027,18 @@ folly::Future EdenServer::startTakeoverShutdown() { enumValue(state->state)))); } + if (getMountPoints() != getAllMountPoints()) { + return makeFuture(std::runtime_error( + "can only perform graceful restart when all mount points are initialized")); + // TODO(xavierd): There is still a potential race after this check if a + // mount is initiated at this point. Injecting a block below and starting + // a mount would manifest it. In practice, this should be fairly rare. + // Moving this further (in stopMountsForTakeover for instance) to avoid + // this race requires EdenFS to being able to gracefully handle failures + // and recover in these cases by restarting several components after they + // have been already shutdown. + } + // Make a copy of the thrift server socket so we can transfer it to the // new edenfs process. Our local thrift will close its own socket when // we stop the server. The easiest way to avoid completely closing the diff --git a/eden/integration/lib/edenclient.py b/eden/integration/lib/edenclient.py index 4670c8d6afbbf..0aa0aab6acd07 100644 --- a/eden/integration/lib/edenclient.py +++ b/eden/integration/lib/edenclient.py @@ -450,7 +450,13 @@ def graceful_restart(self, timeout: float = EDENFS_START_TIMEOUT) -> None: old_pid = self.get_pid_via_thrift() self._process = None - self.start(timeout=timeout, takeover_from=old_pid) + try: + self.start(timeout=timeout, takeover_from=old_pid) + except Exception: + # TODO: There might be classes of errors where the old_process is + # gone and we do need to track the new process here. + self._process = old_process + raise # Check the return code from the old edenfs process return_code = old_process.wait() diff --git a/eden/integration/takeover_test.py b/eden/integration/takeover_test.py index e5600555879f9..b14a77c075bd9 100644 --- a/eden/integration/takeover_test.py +++ b/eden/integration/takeover_test.py @@ -13,9 +13,9 @@ from typing import Dict, List, Optional import pexpect -from eden.fs.cli.util import get_pid_using_lockfile, poll_until +from eden.fs.cli.util import EdenStartError, get_pid_using_lockfile, poll_until from eden.thrift.legacy import EdenClient -from facebook.eden.ttypes import FaultDefinition, UnblockFaultArg +from facebook.eden.ttypes import FaultDefinition, MountState, UnblockFaultArg from fb303_core.ttypes import fb303_status from .lib import testcase @@ -434,6 +434,40 @@ def state_shutting_down() -> Optional[bool]: p.join() + def test_takeover_during_mount(self) -> None: + self.eden.unmount(self.mount_path) + + with self.eden.get_thrift_client_legacy() as client: + client.injectFault( + FaultDefinition(keyClass="mount", keyValueRegex=".*", block=True) + ) + + try: + mountProcess = Process(target=self.eden.mount, args=(self.mount_path,)) + mountProcess.start() + + def mount_initializing() -> Optional[bool]: + with self.eden.get_thrift_client_legacy() as client: + for mount_info in client.listMounts(): + if mount_info.mountPoint == self.mount_path_bytes: + if mount_info.state == MountState.INITIALIZING: + return True + return False + return None + + poll_until(mount_initializing, timeout=60) + + with self.assertRaisesRegex( + EdenStartError, "edenfs exited before becoming healthy" + ): + self.eden.graceful_restart() + finally: + with self.eden.get_thrift_client_legacy() as client: + client.unblockFault( + UnblockFaultArg(keyClass="mount", keyValueRegex=".*") + ) + mountProcess.join() + @testcase.eden_repo_test(run_on_nfs=False) class TakeoverTestNoNFSServer(TakeoverTestBase):