Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC] Hide /sysroot in a private mount namespace #3358

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions Makefile-ostree.am
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ ostree_SOURCES += \
src/ostree/ot-admin-builtin-upgrade.c \
src/ostree/ot-admin-builtin-unlock.c \
src/ostree/ot-admin-builtin-state-overlay.c \
src/ostree/ot-admin-builtin-nsenter.c \
src/ostree/ot-admin-builtins.h \
src/ostree/ot-admin-instutil-builtin-selinux-ensure-labeled.c \
src/ostree/ot-admin-instutil-builtin-set-kargs.c \
Expand Down
2 changes: 1 addition & 1 deletion src/libostree/ostree-bootloader-zipl.c
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ _ostree_bootloader_zipl_post_bls_sync (OstreeBootloader *bootloader, int bootver
// This can happen in a unit testing environment; at some point what we want to do here
// is move all of the zipl logic to a systemd unit instead that's keyed off
// ostree-finalize-staged.service.
if (getuid () != 0)
if (!ot_util_process_privileged ())
return TRUE;

// If we're in a booted deployment, we don't need to spawn a container.
Expand Down
156 changes: 66 additions & 90 deletions src/libostree/ostree-impl-system-generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,31 +126,32 @@ require_internal_units (const char *normal_dir, const char *early_dir, const cha
#endif
}

// Resolve symlink to return osname
static gboolean
ruihe774 marked this conversation as resolved.
Show resolved Hide resolved
_ostree_sysroot_parse_bootlink_aboot (const char *bootlink, char **out_osname, GError **error)
write_unit_file (int dir_fd, const char *path, GCancellable *cancellable, GError **error, const char *fmt, ...)
{
static gsize regex_initialized;
static GRegex *regex;
g_autofree char *symlink_val = glnx_readlinkat_malloc (-1, bootlink, NULL, error);
if (!symlink_val)
return glnx_prefix_error (error, "Failed to read '%s' symlink", bootlink);

if (g_once_init_enter (&regex_initialized))
{
regex = g_regex_new ("^deploy/([^/]+)/", 0, 0, NULL);
g_assert (regex);
g_once_init_leave (&regex_initialized, 1);
}

g_autoptr (GMatchInfo) match = NULL;
if (!g_regex_match (regex, symlink_val, 0, &match))
return glnx_throw (error,
"Invalid aboot symlink in /ostree, expected symlink to resolve to "
"deploy/OSNAME/... instead it resolves to '%s'",
symlink_val);

*out_osname = g_match_info_fetch (match, 1);
g_auto (GLnxTmpfile) tmpf = {
0,
};
if (!glnx_open_tmpfile_linkable_at (dir_fd, ".", O_WRONLY | O_CLOEXEC, &tmpf, error))
return FALSE;
g_autoptr (GOutputStream) outstream = g_unix_output_stream_new (tmpf.fd, FALSE);
gsize bytes_written;
va_list args;
va_start (args, fmt);
const gboolean r = g_output_stream_vprintf (outstream, &bytes_written, cancellable, error, fmt, args);
va_end (args);
if (!r)
return FALSE;
if (!g_output_stream_flush (outstream, cancellable, error))
return FALSE;
g_clear_object (&outstream);
/* It should be readable */
if (!glnx_fchmod (tmpf.fd, 0644, error))
return FALSE;
/* Error out if somehow it already exists, that'll help us debug conflicts */
if (!glnx_link_tmpfile_at (&tmpf, GLNX_LINK_TMPFILE_NOREPLACE, dir_fd, path,
error))
return FALSE;
return TRUE;
}

Expand All @@ -163,22 +164,37 @@ fstab_generator (const char *ostree_target, const bool is_aboot, const char *nor
/* Not currently cancellable, but define a var in case we care later */
GCancellable *cancellable = NULL;
/* Some path constants to avoid typos */
static const char fstab_path[] = "/etc/fstab";
static const char var_path[] = "/var";
const char *fstab_path = "/etc/fstab";
const char *var_dst = "/var";
const char *var_src = OTCORE_RUN_OSTREE_PRIVATE "/var";

/* Written by ostree-sysroot-deploy.c. We parse out the stateroot here since we
* need to know it to mount /var. Unfortunately we can't easily use the
* libostree API to find the booted deployment since /boot might not have been
* mounted yet.
/* Prepare to write to the output unit dir; we use the "normal" dir
* that overrides /usr, but not /etc.
*/
g_autofree char *stateroot = NULL;
if (is_aboot)
{
if (!_ostree_sysroot_parse_bootlink_aboot (ostree_target, &stateroot, error))
return glnx_prefix_error (error, "Parsing aboot stateroot");
}
else if (!_ostree_sysroot_parse_bootlink (ostree_target, NULL, &stateroot, NULL, NULL, error))
return glnx_prefix_error (error, "Parsing stateroot");
glnx_autofd int normal_dir_dfd = -1;
if (!glnx_opendirat (AT_FDCWD, normal_dir, TRUE, &normal_dir_dfd, error))
return FALSE;

/* Generate a unit to unmount var_src */
if (!write_unit_file (normal_dir_dfd, "ostree-unmount-temp-var.service", cancellable, error,
"##\n# Automatically generated by ostree-system-generator\n##\n\n"
"[Unit]\n"
"Documentation=man:ostree(1)\n"
"ConditionPathIsMountPoint=%s\n"
"After=var.mount\n"
"\n"
"[Service]\n"
"Type=oneshot\n"
"ExecStart=/usr/bin/umount --lazy %s\n",
var_src, var_src))
return FALSE;

if (!glnx_shutil_mkdir_p_at (normal_dir_dfd, "local-fs.target.wants", 0755, cancellable,
error))
return FALSE;
if (symlinkat ("../ostree-unmount-temp-var.service", normal_dir_dfd,
"local-fs.target.wants/ostree-unmount-temp-var.service") < 0)
return glnx_throw_errno_prefix (error, "symlinkat");

/* Load /etc/fstab if it exists, and look for a /var mount */
g_autoptr (OtLibMountFile) fstab = setmntent (fstab_path, "re");
Expand All @@ -199,7 +215,7 @@ fstab_generator (const char *ostree_target, const bool is_aboot, const char *nor
path_kill_slashes (where);

/* We're only looking for /var here */
if (strcmp (where, var_path) != 0)
if (strcmp (where, var_dst) != 0)
continue;

found_var_mnt = TRUE;
Expand All @@ -211,59 +227,19 @@ fstab_generator (const char *ostree_target, const bool is_aboot, const char *nor
if (found_var_mnt)
return TRUE;

/* Prepare to write to the output unit dir; we use the "normal" dir
* that overrides /usr, but not /etc.
*/
glnx_autofd int normal_dir_dfd = -1;
if (!glnx_opendirat (AT_FDCWD, normal_dir, TRUE, &normal_dir_dfd, error))
return FALSE;

/* Generate our bind mount unit */
const char *stateroot_var_path = glnx_strjoina ("/sysroot/ostree/deploy/", stateroot, "/var");

g_auto (GLnxTmpfile) tmpf = {
0,
};
if (!glnx_open_tmpfile_linkable_at (normal_dir_dfd, ".", O_WRONLY | O_CLOEXEC, &tmpf, error))
return FALSE;
g_autoptr (GOutputStream) outstream = g_unix_output_stream_new (tmpf.fd, FALSE);
gsize bytes_written;
/* This code is inspired by systemd's fstab-generator.c.
*
* Note that our unit doesn't run if systemd.volatile is enabled;
* see https://github.com/ostreedev/ostree/pull/856
*
* To avoid having submounts of /var propagate into $stateroot/var, the mount
* is made with slave+shared propagation. This means that /var will receive
* mount events from the parent /sysroot mount, but not vice versa. Adding a
* shared peer group below the slave group means that submounts of /var will
* inherit normal shared propagation. See mount_namespaces(7), Linux
* Documentation/filesystems/sharedsubtree.txt and
* https://github.com/ostreedev/ostree/issues/2086. This also happens in
* ostree-prepare-root.c for the INITRAMFS_MOUNT_VAR case.
*/
if (!g_output_stream_printf (outstream, &bytes_written, cancellable, error,
"##\n# Automatically generated by ostree-system-generator\n##\n\n"
"[Unit]\n"
"Documentation=man:ostree(1)\n"
"ConditionKernelCommandLine=!systemd.volatile\n"
"Before=local-fs.target\n"
"\n"
"[Mount]\n"
"Where=%s\n"
"What=%s\n"
"Options=bind,slave,shared\n",
var_path, stateroot_var_path))
return FALSE;
if (!g_output_stream_flush (outstream, cancellable, error))
return FALSE;
g_clear_object (&outstream);
/* It should be readable */
if (!glnx_fchmod (tmpf.fd, 0644, error))
return FALSE;
/* Error out if somehow it already exists, that'll help us debug conflicts */
if (!glnx_link_tmpfile_at (&tmpf, GLNX_LINK_TMPFILE_NOREPLACE, normal_dir_dfd, "var.mount",
error))
if (!write_unit_file (normal_dir_dfd, "var.mount", cancellable, error,
"##\n# Automatically generated by ostree-system-generator\n##\n\n"
"[Unit]\n"
"Documentation=man:ostree(1)\n"
"ConditionKernelCommandLine=!systemd.volatile\n"
"Before=local-fs.target\n"
"\n"
"[Mount]\n"
"Where=%s\n"
"What=%s\n"
"Options=bind\n",
var_dst, var_src))
return FALSE;

/* And ensure it's required; newer systemd will auto-inject fs dependencies
Expand Down
2 changes: 1 addition & 1 deletion src/libostree/ostree-repo-commit.c
Original file line number Diff line number Diff line change
Expand Up @@ -1658,7 +1658,7 @@ ostree_repo_prepare_transaction (OstreeRepo *self, gboolean *out_transaction_res
self->reserved_blocks = reserved_bytes / self->txn.blocksize;

/* Use the appropriate free block count if we're unprivileged */
guint64 bfree = (getuid () != 0 ? stvfsbuf.f_bavail : stvfsbuf.f_bfree);
guint64 bfree = (ot_util_process_privileged () ? stvfsbuf.f_bfree : stvfsbuf.f_bavail);
if (bfree > self->reserved_blocks)
self->txn.max_blocks = bfree - self->reserved_blocks;
else
Expand Down
10 changes: 10 additions & 0 deletions src/libostree/ostree-sysroot-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ struct OstreeSysroot
GLnxLockFile lock;

OstreeSysrootLoadState loadstate;
/*
* XXX: Unfortunately, mount namespaces are per-thread, not per-process.
* In a multi-threaded environment it is hard to ensure that the current thread is always in
* the namespace, so do not use an OstreeSysroot from another thread if you rely on the
* mount namespace.
*/
gboolean mount_namespace_in_use; /* TRUE if caller has told us they used CLONE_NEWNS */
gboolean root_is_ostree_booted; /* TRUE if sysroot is / and we are booted via ostree */
/* The device/inode for / and /etc, used to detect booted deployment */
Expand Down Expand Up @@ -114,8 +119,13 @@ struct OstreeSysroot
// Relative to /boot, consumed by ostree-boot-complete.service
#define _OSTREE_FINALIZE_STAGED_FAILURE_PATH "ostree/finalize-failure.stamp"

gboolean _ostree_sysroot_ensure_visible (OstreeSysroot *self, GError **error);

gboolean _ostree_sysroot_ensure_writable (OstreeSysroot *self, GError **error);

gboolean
_ostree_sysroot_enter_mount_namespace (OstreeSysroot *self, GError **error);

void _ostree_sysroot_emit_journal_msg (OstreeSysroot *self, const char *msg);

gboolean _ostree_sysroot_read_boot_loader_configs (OstreeSysroot *self, int bootversion,
Expand Down
Loading
Loading