Switch to virtiofs #3428

Merged: 4 commits, Sep 7, 2023
3 changes: 0 additions & 3 deletions docs/working.md
@@ -164,9 +164,6 @@ allowing you to directly execute binaries from there. You can also use e.g.
`rpm-ostree usroverlay` and then copy binaries from your host `/run/workdir` into
the VM's rootfs.

(This currently only works on Fedora CoreOS which ships `9p`, not RHCOS. A future version
will use https://virtio-fs.gitlab.io/ )

## Using host binaries

Another related trick is:
16 changes: 7 additions & 9 deletions mantle/cmd/kola/qemuexec.go
@@ -20,7 +20,6 @@ import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"

@@ -91,8 +90,8 @@ func init() {
cmdQemuExec.Flags().BoolVarP(&devshellConsole, "devshell-console", "c", false, "Connect directly to serial console in devshell mode")
cmdQemuExec.Flags().StringVarP(&ignition, "ignition", "i", "", "Path to Ignition config")
cmdQemuExec.Flags().StringVarP(&butane, "butane", "B", "", "Path to Butane config")
cmdQemuExec.Flags().StringArrayVar(&bindro, "bind-ro", nil, "Mount readonly via 9pfs a host directory (use --bind-ro=/path/to/host,/var/mnt/guest")
cmdQemuExec.Flags().StringArrayVar(&bindrw, "bind-rw", nil, "Same as above, but writable")
cmdQemuExec.Flags().StringArrayVar(&bindro, "bind-ro", nil, "Mount $hostpath,$guestpath readonly; for example --bind-ro=/path/on/host,/var/mnt/guest)")
cmdQemuExec.Flags().StringArrayVar(&bindrw, "bind-rw", nil, "Mount $hostpath,$guestpath writable; for example --bind-rw=/path/on/host,/var/mnt/guest)")
cmdQemuExec.Flags().BoolVarP(&forceConfigInjection, "inject-ignition", "", false, "Force injecting Ignition config using guestfs")
cmdQemuExec.Flags().BoolVar(&propagateInitramfsFailure, "propagate-initramfs-failure", false, "Error out if the system fails in the initramfs")
cmdQemuExec.Flags().StringVarP(&consoleFile, "console-to-file", "", "", "Filepath in which to save serial console logs")
@@ -213,8 +212,7 @@ func runQemuExec(cmd *cobra.Command, args []string) error {
ignitionFragments = append(ignitionFragments, "autologin")
cpuCountHost = true
usernet = true
// Can't use 9p on RHEL8, need https://virtio-fs.gitlab.io/ instead in the future
if kola.Options.CosaWorkdir != "" && !strings.HasPrefix(filepath.Base(kola.QEMUOptions.DiskImage), "rhcos") && !strings.HasPrefix(filepath.Base(kola.QEMUOptions.DiskImage), "scos") && kola.Options.Distribution != "rhcos" && kola.Options.Distribution != "scos" {
if kola.Options.CosaWorkdir != "" {
// Conservatively bind readonly to avoid anything in the guest (stray tests, whatever)
// from destroying stuff
bindro = append(bindro, fmt.Sprintf("%s,/var/mnt/workdir", kola.Options.CosaWorkdir))
@@ -296,18 +294,18 @@ func runQemuExec(cmd *cobra.Command, args []string) error {
if err != nil {
return err
}
builder.Mount9p(src, dest, true)
builder.MountHost(src, dest, true)
ensureConfig()
config.Mount9p(dest, true)
config.MountHost(dest, true)
}
for _, b := range bindrw {
src, dest, err := parseBindOpt(b)
if err != nil {
return err
}
builder.Mount9p(src, dest, false)
builder.MountHost(src, dest, false)
ensureConfig()
config.Mount9p(dest, false)
config.MountHost(dest, false)
}
builder.ForceConfigInjection = forceConfigInjection
if len(firstbootkargs) > 0 {
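The `--bind-ro`/`--bind-rw` values above are `HOSTPATH,GUESTPATH` pairs; `parseBindOpt` splits each one into the source and destination handed to `builder.MountHost` and `config.MountHost`. The helper itself is outside this diff, so the following is only a minimal sketch of the expected behavior, assuming a plain comma split:

```go
package main

import (
	"fmt"
	"strings"
)

// parseBindOpt splits a "HOSTPATH,GUESTPATH" bind option into its two parts
// (a sketch; the real helper in qemuexec.go is not shown in this diff).
func parseBindOpt(s string) (string, string, error) {
	parts := strings.SplitN(s, ",", 2)
	if len(parts) != 2 {
		return "", "", fmt.Errorf("malformed bind option %q, expected HOSTPATH,GUESTPATH", s)
	}
	return parts[0], parts[1], nil
}

func main() {
	src, dest, err := parseBindOpt("/path/on/host,/var/mnt/guest")
	if err != nil {
		panic(err)
	}
	fmt.Println(src, dest) // /path/on/host /var/mnt/guest
}
```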
15 changes: 8 additions & 7 deletions mantle/platform/conf/conf.go
@@ -1362,11 +1362,12 @@ resize_terminal() {
PROMPT_COMMAND+=(resize_terminal)`, 0644)
}

// Mount9p adds an Ignition config to mount an folder with 9p
func (c *Conf) Mount9p(dest string, readonly bool) {
readonlyStr := ""
// MountHost adds an Ignition config to mount an folder
func (c *Conf) MountHost(dest string, readonly bool) {
mountType := "virtiofs"
options := ""
if readonly {
readonlyStr = ",ro"
options = "ro"
}
content := fmt.Sprintf(`[Unit]
DefaultDependencies=no
@@ -1375,11 +1376,11 @@ Before=basic.target
[Mount]
What=%s
Where=%s
Type=9p
Options=trans=virtio,version=9p2000.L%s,msize=10485760
Type=%s
Options=%s
[Install]
WantedBy=multi-user.target
`, dest, dest, readonlyStr)
`, dest, dest, mountType, options)
c.AddSystemdUnit(fmt.Sprintf("%s.mount", systemdunit.UnitNameEscape(dest[1:])), content, Enable)
}

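For a read-only bind such as the cosa workdir, `MountHost` now emits a guest systemd mount unit with `Type=virtiofs`, using the destination path as both the virtiofs tag (`What=`) and the mount point (`Where=`). A standalone sketch of the rendered unit for `/var/mnt/workdir`, covering only the template lines visible in the hunk above:

```go
package main

import "fmt"

func main() {
	dest := "/var/mnt/workdir"
	options := "ro" // empty string for a writable mount
	unit := fmt.Sprintf(`[Unit]
DefaultDependencies=no
Before=basic.target
[Mount]
What=%s
Where=%s
Type=virtiofs
Options=%s
[Install]
WantedBy=multi-user.target
`, dest, dest, options)
	fmt.Printf("# var-mnt-workdir.mount\n%s", unit)
}
```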
136 changes: 104 additions & 32 deletions mantle/platform/qemu.go
@@ -17,7 +17,7 @@
// Why not libvirt?
// Two main reasons. First, we really do want to use qemu, and not
// something else. We rely on qemu features/APIs and there's a general
// assumption that the qemu process is local (e.g. we expose 9p filesystem
// assumption that the qemu process is local (e.g. we expose 9p/virtiofs filesystem
// sharing). Second, libvirt runs as a daemon, but we want the
// VMs "lifecycle bound" to their creating process (e.g. kola),
// so that e.g. Ctrl-C (SIGINT) kills both reliably.
@@ -145,11 +145,12 @@ type bootIso struct {

// QemuInstance holds an instantiated VM through its lifecycle.
type QemuInstance struct {
qemu exec.Cmd
architecture string
tempdir string
swtpm exec.Cmd
nbdServers []exec.Cmd
qemu exec.Cmd
architecture string
tempdir string
swtpm exec.Cmd
// Helpers are child processes such as nbd or virtiofsd that should be lifecycle bound to qemu
helpers []exec.Cmd
hostForwardedPorts []HostForwardPort

journalPipe *os.File
@@ -287,12 +288,12 @@ func (inst *QemuInstance) Destroy() {
inst.swtpm.Kill() //nolint // Ignore errors
inst.swtpm = nil
}
for _, nbdServ := range inst.nbdServers {
if nbdServ != nil {
nbdServ.Kill() //nolint // Ignore errors
for _, p := range inst.helpers {
if p != nil {
p.Kill() //nolint // Ignore errors
}
}
inst.nbdServers = nil
inst.helpers = nil

if inst.tempdir != "" {
if err := os.RemoveAll(inst.tempdir); err != nil {
@@ -421,6 +422,13 @@ func (inst *QemuInstance) RemovePrimaryBlockDevice() (err2 error) {
return nil
}

// A directory mounted from the host into the guest, via 9p or virtiofs
type HostMount struct {
src string
dest string
readonly bool
}

// QemuBuilder is a configurator that can then create a qemu instance
type QemuBuilder struct {
// ConfigFile is a path to Ignition configuration
@@ -477,9 +485,10 @@ type QemuBuilder struct {
finalized bool
diskID uint
disks []*Disk
fs9pID uint
// virtioSerialID is incremented for each device
virtioSerialID uint
// hostMounts is an array of directories mounted (via 9p or virtiofs) from the host
hostMounts []HostMount
// fds is file descriptors we own to pass to qemu
fds []*os.File

@@ -702,16 +711,11 @@ func (builder *QemuBuilder) encryptIgnitionConfig() error {
return nil
}

// Mount9p sets up a mount point from the host to guest. To be replaced
// with https://virtio-fs.gitlab.io/ once it lands everywhere.
func (builder *QemuBuilder) Mount9p(source, destHint string, readonly bool) {
builder.fs9pID++
readonlyStr := ""
if readonly {
readonlyStr = ",readonly=on"
}
builder.Append("--fsdev", fmt.Sprintf("local,id=fs%d,path=%s,security_model=mapped%s", builder.fs9pID, source, readonlyStr))
builder.Append("-device", virtio(builder.architecture, "9p", fmt.Sprintf("fsdev=fs%d,mount_tag=%s", builder.fs9pID, destHint)))
// MountHost sets up a mount point from the host to guest.
// Note that virtiofs does not currently support read-only mounts (which is really surprising!).
// We do mount it read-only by default in the guest, however.
func (builder *QemuBuilder) MountHost(source, dest string, readonly bool) {
builder.hostMounts = append(builder.hostMounts, HostMount{src: source, dest: dest, readonly: readonly})
}

// supportsFwCfg if the target system supports injecting
@@ -1276,37 +1280,43 @@ func (builder *QemuBuilder) Append(args ...string) {

// baseQemuArgs takes a board and returns the basic qemu
// arguments needed for the current architecture.
func baseQemuArgs(arch string) ([]string, error) {
accel := "accel=kvm"
func baseQemuArgs(arch string, memoryMiB int) ([]string, error) {
// memoryDevice is the object identifier we use for the backing RAM
const memoryDevice = "mem"

kvm := true
hostArch := coreosarch.CurrentRpmArch()
// The machine argument needs to reference our memory device; see below
machineArg := "memory-backend=" + memoryDevice
accel := "accel=kvm"
if _, ok := os.LookupEnv("COSA_NO_KVM"); ok || hostArch != arch {
accel = "accel=tcg"
kvm = false
}
machineArg += "," + accel
var ret []string
switch arch {
case "x86_64":
ret = []string{
"qemu-system-x86_64",
"-machine", accel,
"-machine", machineArg,
}
case "aarch64":
ret = []string{
"qemu-system-aarch64",
"-machine", "virt,gic-version=max," + accel,
"-machine", "virt,gic-version=max," + machineArg,
}
case "s390x":
ret = []string{
"qemu-system-s390x",
"-machine", "s390-ccw-virtio," + accel,
"-machine", "s390-ccw-virtio," + machineArg,
}
case "ppc64le":
ret = []string{
"qemu-system-ppc64",
// kvm-type=HV ensures we use bare metal KVM and not "user mode"
// https://qemu.readthedocs.io/en/latest/system/ppc/pseries.html#switching-between-the-kvm-pr-and-kvm-hv-kernel-module
"-machine", "pseries,kvm-type=HV," + accel,
"-machine", "pseries,kvm-type=HV," + machineArg,
}
default:
return nil, fmt.Errorf("architecture %s not supported for qemu", arch)
@@ -1321,6 +1331,9 @@ func baseQemuArgs(arch string) ([]string, error) {
ret = append(ret, "-cpu", "Nehalem")
}
}
// And define memory using a memfd (in shared mode), which is needed for virtiofs
ret = append(ret, "-object", fmt.Sprintf("memory-backend-memfd,id=%s,size=%dM,share=on", memoryDevice, memoryMiB))
ret = append(ret, "-m", fmt.Sprintf("%d", memoryMiB))
return ret, nil
}

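vhost-user-fs needs guest RAM that virtiofsd can map, so `baseQemuArgs` now backs the machine with a shared memfd object and points the machine at it via `memory-backend=`. A sketch of the resulting base arguments for x86_64 with KVM, assuming an illustrative 4096 MiB:

```go
package main

import "fmt"

func main() {
	const memoryDevice = "mem" // matches the id referenced by memory-backend=
	memoryMiB := 4096          // illustrative size, not a value from the PR
	args := []string{
		"qemu-system-x86_64",
		"-machine", "memory-backend=" + memoryDevice + ",accel=kvm",
		"-object", fmt.Sprintf("memory-backend-memfd,id=%s,size=%dM,share=on", memoryDevice, memoryMiB),
		"-m", fmt.Sprintf("%d", memoryMiB),
	}
	fmt.Println(args)
}
```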
@@ -1591,13 +1604,10 @@ func (builder *QemuBuilder) Exec() (*QemuInstance, error) {
}
}()

argv, err := baseQemuArgs(builder.architecture)
argv, err := baseQemuArgs(builder.architecture, builder.MemoryMiB)
if err != nil {
return nil, err
}
// Allocate a shared memory object, which is needed for virtiofs
argv = append(argv, "-object", fmt.Sprintf("memory-backend-memfd,id=mem,size=%dM,share=on", builder.MemoryMiB))
argv = append(argv, "-m", fmt.Sprintf("%d", builder.MemoryMiB))

if builder.Processors < 0 {
nproc, err := system.GetProcessors()
@@ -1693,7 +1703,7 @@ func (builder *QemuBuilder) Exec() (*QemuInstance, error) {
if err := cmd.Start(); err != nil {
return nil, errors.Wrapf(err, "spawing nbd server")
}
inst.nbdServers = append(inst.nbdServers, cmd)
inst.helpers = append(inst.helpers, cmd)
}
}

@@ -1773,6 +1783,68 @@ func (builder *QemuBuilder) Exec() (*QemuInstance, error) {
return nil, err
}

// Process virtiofs mounts
if len(builder.hostMounts) > 0 {
if err := builder.ensureTempdir(); err != nil {
return nil, err
}

plog.Debug("creating virtiofs helpers")

// Spawn off a virtiofsd helper per mounted path
virtiofsHelpers := make(map[string]exec.Cmd)
for i, hostmnt := range builder.hostMounts {
// By far the most common failure to spawn virtiofsd will be a typo'd source directory,
// so let's synchronously check that ourselves here.
if _, err := os.Stat(hostmnt.src); err != nil {
return nil, fmt.Errorf("failed to access virtiofs source directory %s", hostmnt.src)
}
virtiofsChar := fmt.Sprintf("virtiofschar%d", i)
virtiofsdSocket := filepath.Join(builder.tempdir, fmt.Sprintf("virtiofsd-%d.sock", i))
builder.Append("-chardev", fmt.Sprintf("socket,id=%s,path=%s", virtiofsChar, virtiofsdSocket))
builder.Append("-device", fmt.Sprintf("vhost-user-fs-pci,queue-size=1024,chardev=%s,tag=%s", virtiofsChar, hostmnt.dest))
plog.Debugf("creating virtiofs helper for %s", hostmnt.src)
// TODO: Honor hostmnt.readonly somehow here (add an option to virtiofsd)
p := exec.Command("/usr/libexec/virtiofsd", "--sandbox", "none", "--socket-path", virtiofsdSocket, "--shared-dir", ".")
p.Dir = hostmnt.src
// Quiet the daemon by default
p.Env = append(p.Env, "RUST_LOG=ERROR")
p.Stderr = os.Stderr
p.SysProcAttr = &syscall.SysProcAttr{
Pdeathsig: syscall.SIGTERM,
}
if err := p.Start(); err != nil {
return nil, fmt.Errorf("failed to start virtiofsd")
}
virtiofsHelpers[virtiofsdSocket] = p
}
// Loop waiting for the sockets to appear
err := util.RetryUntilTimeout(10*time.Minute, 1*time.Second, func() error {
found := []string{}
for sockpath := range virtiofsHelpers {
if _, err := os.Stat(sockpath); err == nil {
found = append(found, sockpath)
}
}
for _, sockpath := range found {
helper := virtiofsHelpers[sockpath]
inst.helpers = append(inst.helpers, helper)
delete(virtiofsHelpers, sockpath)
}
if len(virtiofsHelpers) == 0 {
return nil
}
waitingFor := []string{}
for socket := range virtiofsHelpers {
waitingFor = append(waitingFor, socket)
}
return fmt.Errorf("waiting for virtiofsd sockets: %s", strings.Join(waitingFor, " "))
})
if err != nil {
return nil, err
}
}

fdnum := 3 // first additional file starts at position 3
for i := range builder.fds {
fdset := i + 1 // Start at 1
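Taken together, each host mount becomes three cooperating pieces: a vhost-user socket chardev, a `vhost-user-fs-pci` device whose `tag` is the guest destination (matching the `What=` of the generated mount unit), and a `virtiofsd` helper serving the host directory over that socket, lifecycle-bound to its parent via `Pdeathsig`. A compact sketch with illustrative paths (the socket normally lives in the builder's tempdir):

```go
package main

import (
	"fmt"
	"os/exec"
	"syscall"
)

func main() {
	src := "/srv/cosa-workdir"        // host directory to share (illustrative)
	dest := "/var/mnt/workdir"        // guest mount point; doubles as the virtiofs tag
	socket := "/tmp/virtiofsd-0.sock" // vhost-user socket shared with qemu (illustrative path)

	// qemu side: chardev backed by the socket, plus the vhost-user-fs device.
	qemuArgs := []string{
		"-chardev", fmt.Sprintf("socket,id=virtiofschar0,path=%s", socket),
		"-device", fmt.Sprintf("vhost-user-fs-pci,queue-size=1024,chardev=virtiofschar0,tag=%s", dest),
	}
	fmt.Println(qemuArgs)

	// Helper side: virtiofsd serves src over the socket; Pdeathsig ties its
	// lifetime to the parent process, mirroring how the PR binds it to kola.
	cmd := exec.Command("/usr/libexec/virtiofsd",
		"--sandbox", "none", "--socket-path", socket, "--shared-dir", ".")
	cmd.Dir = src
	cmd.SysProcAttr = &syscall.SysProcAttr{Pdeathsig: syscall.SIGTERM}
	fmt.Println(cmd.Args)
}
```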
4 changes: 0 additions & 4 deletions src/buildextend-legacy-oscontainer.sh
@@ -5,10 +5,6 @@ set -euo pipefail

# Start VM and call buildah
final_outfile=$(realpath "$1"); shift
if [ "$(uname -m)" == 'ppc64le' ]; then
# helps with 9p 'cannot allocate memory' errors on ppc64le
COSA_SUPERMIN_MEMORY=6144
fi
IMAGE_TYPE=legacy-oscontainer
prepare_build
tmp_outfile=${tmp_builddir}/legacy-oscontainer.ociarchive
2 changes: 1 addition & 1 deletion src/cmd-buildextend-metal
@@ -171,7 +171,7 @@ if jq -re '.["deploy-via-container"]' < "${image_json}"; then
deploy_via_container="true"
fi
container_imgref=$(jq -r '.["container-imgref"]//""' < "${image_json}")
# Nowadays we pull the container across 9p rather than accessing the repo, see
# Nowadays we pull the container across virtiofs rather than accessing the repo, see
# https://github.com/openshift/os/issues/594
ostree_container=ostree-unverified-image:oci-archive:$builddir/$(meta_key images.ostree.path)

2 changes: 1 addition & 1 deletion src/cmd-run
@@ -9,7 +9,7 @@ set -euo pipefail
# You can exit via either exiting the login session cleanly
# in SSH (ctrl-d/exit in bash), or via `poweroff`.
#
# If 9p is available and this is started from a coreos-assembler
# If this is started from a coreos-assembler
# working directory, the build directory will be mounted readonly
# at /var/mnt/workdir, and the tmp/ directory will be read *write*
# at /var/mnt/workdir-tmp. This allows you to easily exchange