Skip to content

Commit

Permalink
Switch to virtiofs
Browse files Browse the repository at this point in the history
Closes: #1812

The key benefits here are:

- This also works for RHEL (this is a *big deal* for my dev workflow
  parity)
- It correctly handles symlinks
- It's more maintained (e.g. used for kata containers) and hopefully
  we won't hit obscure bugs like the 9p OOM ones.
  • Loading branch information
cgwalters committed Apr 15, 2023
1 parent 77e3d12 commit fa81b3c
Show file tree
Hide file tree
Showing 12 changed files with 117 additions and 75 deletions.
3 changes: 0 additions & 3 deletions docs/working.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,6 @@ allowing you to directly execute binaries from there. You can also use e.g.
`rpm-ostree usroverlay` and then copy binaries from your host `/run/workdir` into
the VM's rootfs.

(This currently only works on Fedora CoreOS which ships `9p`, not RHCOS. A future version
will use https://virtio-fs.gitlab.io/ )

## Using host binaries

Another related trick is:
Expand Down
14 changes: 6 additions & 8 deletions mantle/cmd/kola/qemuexec.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"

Expand Down Expand Up @@ -91,7 +90,7 @@ func init() {
cmdQemuExec.Flags().BoolVarP(&devshellConsole, "devshell-console", "c", false, "Connect directly to serial console in devshell mode")
cmdQemuExec.Flags().StringVarP(&ignition, "ignition", "i", "", "Path to Ignition config")
cmdQemuExec.Flags().StringVarP(&butane, "butane", "B", "", "Path to Butane config")
cmdQemuExec.Flags().StringArrayVar(&bindro, "bind-ro", nil, "Mount readonly via 9pfs a host directory (use --bind-ro=/path/to/host,/var/mnt/guest")
cmdQemuExec.Flags().StringArrayVar(&bindro, "bind-ro", nil, "Mount readonly via virtiofs a host directory (use --bind-ro=/path/to/host,/var/mnt/guest")
cmdQemuExec.Flags().StringArrayVar(&bindrw, "bind-rw", nil, "Same as above, but writable")
cmdQemuExec.Flags().BoolVarP(&forceConfigInjection, "inject-ignition", "", false, "Force injecting Ignition config using guestfs")
cmdQemuExec.Flags().BoolVar(&propagateInitramfsFailure, "propagate-initramfs-failure", false, "Error out if the system fails in the initramfs")
Expand Down Expand Up @@ -178,8 +177,7 @@ func runQemuExec(cmd *cobra.Command, args []string) error {
ignitionFragments = append(ignitionFragments, "autologin")
cpuCountHost = true
usernet = true
// Can't use 9p on RHEL8, need https://virtio-fs.gitlab.io/ instead in the future
if kola.Options.CosaWorkdir != "" && !strings.HasPrefix(filepath.Base(kola.QEMUOptions.DiskImage), "rhcos") && !strings.HasPrefix(filepath.Base(kola.QEMUOptions.DiskImage), "scos") && kola.Options.Distribution != "rhcos" && kola.Options.Distribution != "scos" {
if kola.Options.CosaWorkdir != "" {
// Conservatively bind readonly to avoid anything in the guest (stray tests, whatever)
// from destroying stuff
bindro = append(bindro, fmt.Sprintf("%s,/var/mnt/workdir", kola.Options.CosaWorkdir))
Expand Down Expand Up @@ -261,18 +259,18 @@ func runQemuExec(cmd *cobra.Command, args []string) error {
if err != nil {
return err
}
builder.Mount9p(src, dest, true)
builder.MountVirtiofs(src, dest)
ensureConfig()
config.Mount9p(dest, true)
config.MountVirtiofs(dest, true)
}
for _, b := range bindrw {
src, dest, err := parseBindOpt(b)
if err != nil {
return err
}
builder.Mount9p(src, dest, false)
builder.MountVirtiofs(src, dest)
ensureConfig()
config.Mount9p(dest, false)
config.MountVirtiofs(dest, false)
}
builder.ForceConfigInjection = forceConfigInjection
if len(firstbootkargs) > 0 {
Expand Down
10 changes: 5 additions & 5 deletions mantle/platform/conf/conf.go
Original file line number Diff line number Diff line change
Expand Up @@ -1362,11 +1362,11 @@ resize_terminal() {
PROMPT_COMMAND+=(resize_terminal)`, 0644)
}

// Mount9p adds an Ignition config to mount an folder with 9p
func (c *Conf) Mount9p(dest string, readonly bool) {
// MountVirtiofs adds an Ignition config to mount an folder with 9p
func (c *Conf) MountVirtiofs(dest string, readonly bool) {
readonlyStr := ""
if readonly {
readonlyStr = ",ro"
readonlyStr = "ro"
}
content := fmt.Sprintf(`[Unit]
DefaultDependencies=no
Expand All @@ -1375,8 +1375,8 @@ Before=basic.target
[Mount]
What=%s
Where=%s
Type=9p
Options=trans=virtio,version=9p2000.L%s,msize=10485760
Type=virtiofs
Options=%s
[Install]
WantedBy=multi-user.target
`, dest, dest, readonlyStr)
Expand Down
119 changes: 97 additions & 22 deletions mantle/platform/qemu.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
// Why not libvirt?
// Two main reasons. First, we really do want to use qemu, and not
// something else. We rely on qemu features/APIs and there's a general
// assumption that the qemu process is local (e.g. we expose 9p filesystem
// assumption that the qemu process is local (e.g. we expose virtiofs filesystem
// sharing). Second, libvirt runs as a daemon, but we want the
// VMs "lifecycle bound" to their creating process (e.g. kola),
// so that e.g. Ctrl-C (SIGINT) kills both reliably.
Expand Down Expand Up @@ -154,11 +154,12 @@ type bootIso struct {

// QemuInstance holds an instantiated VM through its lifecycle.
type QemuInstance struct {
qemu exec.Cmd
architecture string
tempdir string
swtpm exec.Cmd
nbdServers []exec.Cmd
qemu exec.Cmd
architecture string
tempdir string
swtpm exec.Cmd
// Helpers are child processes such as nbd or virtiofsd that should be lifecycle bound to qemu
helpers []exec.Cmd
hostForwardedPorts []HostForwardPort

journalPipe *os.File
Expand Down Expand Up @@ -296,12 +297,12 @@ func (inst *QemuInstance) Destroy() {
inst.swtpm.Kill() //nolint // Ignore errors
inst.swtpm = nil
}
for _, nbdServ := range inst.nbdServers {
if nbdServ != nil {
nbdServ.Kill() //nolint // Ignore errors
for _, p := range inst.helpers {
if p != nil {
p.Kill() //nolint // Ignore errors
}
}
inst.nbdServers = nil
inst.helpers = nil

if inst.tempdir != "" {
if err := os.RemoveAll(inst.tempdir); err != nil {
Expand Down Expand Up @@ -430,6 +431,11 @@ func (inst *QemuInstance) RemovePrimaryBlockDevice() (err2 error) {
return nil
}

type VirtiofsMount struct {
src string
tag string
}

// QemuBuilder is a configurator that can then create a qemu instance
type QemuBuilder struct {
// ConfigFile is a path to Ignition configuration
Expand Down Expand Up @@ -486,7 +492,8 @@ type QemuBuilder struct {
finalized bool
diskID uint
disks []*Disk
fs9pID uint
// virtiofs is an array of virtiofs mounts
virtiofs []VirtiofsMount
// virtioSerialID is incremented for each device
virtioSerialID uint
// fds is file descriptors we own to pass to qemu
Expand Down Expand Up @@ -711,16 +718,10 @@ func (builder *QemuBuilder) encryptIgnitionConfig() error {
return nil
}

// Mount9p sets up a mount point from the host to guest. To be replaced
// with https://virtio-fs.gitlab.io/ once it lands everywhere.
func (builder *QemuBuilder) Mount9p(source, destHint string, readonly bool) {
builder.fs9pID++
readonlyStr := ""
if readonly {
readonlyStr = ",readonly=on"
}
builder.Append("--fsdev", fmt.Sprintf("local,id=fs%d,path=%s,security_model=mapped%s", builder.fs9pID, source, readonlyStr))
builder.Append("-device", virtio(builder.architecture, "9p", fmt.Sprintf("fsdev=fs%d,mount_tag=%s", builder.fs9pID, destHint)))
// MountVirtiofs sets up a mount point from the host to guest.
// Note that virtiofs does not currently support read-only mounts (which is really surprising!)
func (builder *QemuBuilder) MountVirtiofs(source, dest string) {
builder.virtiofs = append(builder.virtiofs, VirtiofsMount{src: source, tag: dest})
}

// supportsFwCfg if the target system supports injecting
Expand Down Expand Up @@ -1636,7 +1637,7 @@ func (builder *QemuBuilder) Exec() (*QemuInstance, error) {
if err := cmd.Start(); err != nil {
return nil, errors.Wrapf(err, "spawing nbd server")
}
inst.nbdServers = append(inst.nbdServers, cmd)
inst.helpers = append(inst.helpers, cmd)
}
}

Expand Down Expand Up @@ -1716,6 +1717,80 @@ func (builder *QemuBuilder) Exec() (*QemuInstance, error) {
return nil, err
}

// Process virtiofs mounts
if len(builder.virtiofs) > 0 {
if err := builder.ensureTempdir(); err != nil {
return nil, err
}

plog.Debug("creating viritofs helpers")

// Allocate a shared memory object and set up NUMA, which is needed for virtiofs
builder.Append("-object", fmt.Sprintf("memory-backend-file,id=mem,size=%dM,mem-path=/dev/shm,share=on", builder.Memory), "-numa", "node,memdev=mem")

// We assume we're already running in a container
isolation := "none"

type helper struct {
cmd exec.Cmd
sockpath string
}

// Spawn off a virtiofsd helper per mounted path
virtiofsHelpers := make(map[string]exec.Cmd)
for i, virtiofs := range builder.virtiofs {
// By far the most common failure to spawn virtiofsd will be a typo'd source directory,
// so let's synchronously check that ourselves here.
if _, err := os.Stat(virtiofs.src); err != nil {
return nil, fmt.Errorf("Failed to access virtiofs source directory %s", virtiofs.src)
}
virtiofsChar := fmt.Sprintf("virtiofschar%d", i)
virtiofsdSocket := filepath.Join(builder.tempdir, fmt.Sprintf("virtiofsd-%d.sock", i))
builder.Append("-chardev", fmt.Sprintf("socket,id=%s,path=%s", virtiofsChar, virtiofsdSocket))
builder.Append("-device", fmt.Sprintf("vhost-user-fs-pci,queue-size=1024,chardev=%s,tag=%s", virtiofsChar, virtiofs.tag))
plog.Debugf("creating viritofs helper for %s", virtiofs.src)
p := exec.Command("/usr/libexec/virtiofsd", "--sandbox", isolation, "--socket-path", virtiofsdSocket, "--shared-dir", virtiofs.src)
// Quiet the daemon by default
p.Env = append(p.Env, "RUST_LOG=ERROR")
p.Stderr = os.Stderr
p.SysProcAttr = &syscall.SysProcAttr{
Pdeathsig: syscall.SIGTERM,
}
if err := p.Start(); err != nil {
return nil, fmt.Errorf("Failed to start virtiofsd")
}
virtiofsHelpers[virtiofsdSocket] = p
}
// Loop waiting for the sockets to appear
delay := time.Millisecond * 250
repetitions := time.Minute / delay
for i := 0; i < int(repetitions) && len(virtiofsHelpers) > 0; i++ {
// Delay between subsequent loops
if i > 0 {
time.Sleep(delay)
}
found := []string{}
for sockpath := range virtiofsHelpers {
if _, err := os.Stat(sockpath); err == nil {
found = append(found, sockpath)
}
}
for _, sockpath := range found {
delete(virtiofsHelpers, sockpath)
}
}
if len(virtiofsHelpers) > 0 {
sockets := ""
for sockpath := range virtiofsHelpers {
if len(sockets) > 0 {
sockets += " "
}
sockets += sockpath
}
return nil, fmt.Errorf("Failed to wait for virtiofsd sockets: %s", sockets)
}
}

fdnum := 3 // first additional file starts at position 3
for i := range builder.fds {
fdset := i + 1 // Start at 1
Expand Down
4 changes: 0 additions & 4 deletions src/buildextend-legacy-oscontainer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@ set -euo pipefail

# Start VM and call buildah
final_outfile=$(realpath "$1"); shift
if [ "$(uname -m)" == 'ppc64le' ]; then
# helps with 9p 'cannot allocate memory' errors on ppc64le
COSA_SUPERMIN_MEMORY=6144
fi
IMAGE_TYPE=legacy-oscontainer
prepare_build
tmp_outfile=${tmp_builddir}/legacy-oscontainer.ociarchive
Expand Down
2 changes: 1 addition & 1 deletion src/cmd-buildextend-metal
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ if jq -re '.["deploy-via-container"]' < "${image_json}"; then
deploy_via_container="true"
fi
container_imgref=$(jq -r '.["container-imgref"]//""' < "${image_json}")
# Nowadays we pull the container across 9p rather than accessing the repo, see
# Nowadays we pull the container across virtiofs rather than accessing the repo, see
# https://github.com/openshift/os/issues/594
ostree_container=ostree-unverified-image:oci-archive:$builddir/$(meta_key images.ostree.path)

Expand Down
2 changes: 1 addition & 1 deletion src/cmd-run
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ set -euo pipefail
# You can exit via either exiting the login session cleanly
# in SSH (ctrl-d/exit in bash), or via `poweroff`.
#
# If 9p is available and this is started from a coreos-assembler
# If this is started from a coreos-assembler
# working directory, the build directory will be mounted readonly
# at /var/mnt/workdir, and the tmp/ directory will be read *write*
# at /var/mnt/workdir-tmp. This allows you to easily exchange
Expand Down
16 changes: 2 additions & 14 deletions src/cmdlib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ meta_key() {
}

# runvm generates and runs a minimal VM which we use to "bootstrap" our build
# process. It mounts the workdir via 9p. If you need to add new packages into
# process. It mounts the workdir via virtiofs. If you need to add new packages into
# the vm, update `vmdeps.txt`.
# If you need to debug it, one trick is to change the `-serial file` below
# into `-serial stdio`, drop the <&- and virtio-serial stuff and then e.g. add
Expand Down Expand Up @@ -790,28 +790,16 @@ EOF
esac

kola_args=(kola qemuexec -m "${COSA_SUPERMIN_MEMORY:-${memory_default}}" --auto-cpus -U --workdir none \
--console-to-file "${runvm_console}")
--console-to-file "${runvm_console}" --bind-rw "${workdir},workdir")

base_qemu_args=(-drive 'if=none,id=root,format=raw,snapshot=on,file='"${vmbuilddir}"'/root,index=1' \
-device 'virtio-blk,drive=root' \
-kernel "${vmbuilddir}/kernel" -initrd "${vmbuilddir}/initrd" \
-no-reboot -nodefaults \
-device virtio-serial \
-virtfs 'local,id=workdir,path='"${workdir}"',security_model=none,mount_tag=workdir' \
-append "root=UUID=${superminrootfsuuid} console=${DEFAULT_TERMINAL} selinux=1 enforcing=0 autorelabel=1" \
)

# support local dev cases where src/config is a symlink. Note if you change or extend to this set,
# you also need to update supermin-init-prelude.sh to mount it inside the VM.
for maybe_symlink in "${workdir}"/{src/config,src/yumrepos,builds}; do
if [ -L "${maybe_symlink}" ]; then
# qemu follows symlinks
local bn
bn=$(basename "${maybe_symlink}")
base_qemu_args+=("-virtfs" "local,id=${bn},path=${maybe_symlink},security_model=none,mount_tag=${bn}")
fi
done

if [ -z "${RUNVM_SHELL:-}" ]; then
if ! "${kola_args[@]}" -- "${base_qemu_args[@]}" \
-device virtserialport,chardev=virtioserial0,name=cosa-cmdout \
Expand Down
4 changes: 2 additions & 2 deletions src/compose.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ composejson=cache/repo/compose.json
rm -rf "${repo}"
ostree init --repo="${repo}" --mode=archive-z2

# we do need to pull at least the overlay bits over 9p, but it shouldn't be that
# we do need to pull at least the overlay bits over virtiofs, but it shouldn't be that
# many files
ostree refs --repo tmp/repo overlay --list | \
xargs -r ostree pull-local --repo "${repo}" tmp/repo
# And import commit metadata for all refs; this will bring in the previous
# build if there is one. Ideally, we'd import the SELinux policy too to take
# advantage of https://github.com/coreos/rpm-ostree/pull/1704 but that's yet
# more files over 9p and our pipelines don't have cache anyway (and devs likely
# more files over virtiofs and our pipelines don't have cache anyway (and devs likely
# use the privileged path).
ostree refs --repo tmp/repo | \
xargs -r ostree pull-local --commit-metadata-only --repo "${repo}" tmp/repo
Expand Down
2 changes: 1 addition & 1 deletion src/create_disk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ if [ -z "$platforms_json" ]; then
echo "Missing --platforms-json" >&2
exit 1
fi
# just copy it over to /tmp and work from there to minimize 9p I/O
# just copy it over to /tmp and work from there to minimize virtiofs I/O
cp "${platforms_json}" /tmp/platforms.json
platforms_json=/tmp/platforms.json
platform_grub_cmds=$(jq -r ".${arch}.${platform}.grub_commands // [] | join(\"\\\\n\")" < "${platforms_json}")
Expand Down
2 changes: 1 addition & 1 deletion src/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ genisoimage
make git rpm-build

# virt dependencies
libguestfs-tools libguestfs-tools-c /usr/bin/qemu-img qemu-kvm swtpm
libguestfs-tools libguestfs-tools-c virtiofsd /usr/bin/qemu-img qemu-kvm swtpm
# And the main arch emulators for cross-arch testing
qemu-system-aarch64-core qemu-system-ppc-core qemu-system-s390x-core qemu-system-x86-core

Expand Down
14 changes: 1 addition & 13 deletions src/supermin-init-prelude.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ mount -t tmpfs tmpfs /dev/shm
# load selinux policy
LANG=C /sbin/load_policy -i

# load kernel module for 9pnet_virtio for 9pfs mount
/sbin/modprobe 9pnet_virtio

# need fuse module for rofiles-fuse/bwrap during post scripts run
/sbin/modprobe fuse
Expand All @@ -36,19 +34,9 @@ fi
umask 002

# set up workdir
# For 9p mounts set msize to 100MiB
# https://github.com/coreos/coreos-assembler/issues/2171
mkdir -p "${workdir:?}"
mount -t 9p -o rw,trans=virtio,version=9p2000.L,msize=10485760 workdir "${workdir}"

# This loop pairs with virtfs setups for qemu in cmdlib.sh. Keep them in sync.
for maybe_symlink in "${workdir}"/{src/config,src/yumrepos,builds}; do
if [ -L "${maybe_symlink}" ]; then
bn=$(basename "${maybe_symlink}")
mkdir -p "$(readlink "${maybe_symlink}")"
mount -t 9p -o rw,trans=virtio,version=9p2000.L,msize=10485760 "${bn}" "${maybe_symlink}"
fi
done
mount -t virtiofs -o rw workdir "${workdir}"

mkdir -p "${workdir}"/cache
cachedev=$(blkid -lt LABEL=cosa-cache -o device || true)
Expand Down

0 comments on commit fa81b3c

Please sign in to comment.