neonvm: Add delegated CPU limiting with neonvm-daemon
todo:
- Implement the backend in neonvm-daemon
- Fix build-test-vm needing access to neonvm-daemon (how to get it?)
- Add iptables rules inside the VM to prevent access to neonvm-daemon
- Figure out why scripts/run-bench.sh has 0.1x TPS at 0.25 CPU...
sharnoff committed Sep 25, 2024
1 parent e533771 commit 6fbc3e2
Showing 13 changed files with 371 additions and 14 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/build-images.yaml
@@ -51,6 +51,7 @@ env:
IMG_CONTROLLER: "neondatabase/neonvm-controller"
IMG_VXLAN_CONTROLLER: "neondatabase/neonvm-vxlan-controller"
IMG_RUNNER: "neondatabase/neonvm-runner"
IMG_DAEMON: "neondatabase/neonvm-daemon"
IMG_KERNEL: "neondatabase/vm-kernel"
IMG_SCHEDULER: "neondatabase/autoscale-scheduler"
IMG_AUTOSCALER_AGENT: "neondatabase/autoscaler-agent"
@@ -85,6 +86,7 @@ jobs:
echo "controller=${{ env.IMG_CONTROLLER }}:${{ inputs.tag }}" | tee -a $GITHUB_OUTPUT
echo "vxlan-controller=${{ env.IMG_VXLAN_CONTROLLER }}:${{ inputs.tag }}" | tee -a $GITHUB_OUTPUT
echo "runner=${{ env.IMG_RUNNER }}:${{ inputs.tag }}" | tee -a $GITHUB_OUTPUT
echo "daemon=${{ env.IMG_DAEMON }}:${{ inputs.tag }}" | tee -a $GITHUB_OUTPUT
echo "scheduler=${{ env.IMG_SCHEDULER }}:${{ inputs.tag }}" | tee -a $GITHUB_OUTPUT
echo "autoscaler-agent=${{ env.IMG_AUTOSCALER_AGENT }}:${{ inputs.tag }}" | tee -a $GITHUB_OUTPUT
echo "cluster-autoscaler=${{ env.IMG_CLUSTER_AUTOSCALER }}:${{ inputs.tag }}" | tee -a $GITHUB_OUTPUT
@@ -214,6 +216,15 @@ jobs:
build-args: |
GO_BASE_IMG=${{ env.GO_BASE_IMG }}
- name: Build and push neonvm-daemon image
uses: docker/build-push-action@v3
with:
context: .
platforms: linux/amd64
push: true
file: neonvm/daemon/Dockerfile
tags: ${{ needs.tags.outputs.daemon }}

Check failure on line 226 in .github/workflows/build-images.yaml (GitHub Actions / actionlint):
.github/workflows/build-images.yaml:226:21: property "daemon" is not defined in object type {autoscaler-agent: string; cluster-autoscaler: string; controller: string; runner: string; scheduler: string; vxlan-controller: string} [expression]

- name: Generate neonvm-controller build tags
id: controller-build-tags
env:
@@ -301,6 +312,7 @@ jobs:
neonvm-controller \
neonvm-vxlan-controller \
neonvm-runner \
neonvm-daemon \
vm-kernel \
autoscale-scheduler \
autoscaler-agent \
1 change: 1 addition & 0 deletions .github/workflows/e2e-test.yaml
@@ -112,6 +112,7 @@ jobs:
IMG_CONTROLLER: ${{ needs.build-images.outputs.controller }}
IMG_VXLAN_CONTROLLER: ${{ needs.build-images.outputs.vxlan-controller }}
IMG_RUNNER: ${{ needs.build-images.outputs.runner }}
IMG_DAEMON: ${{ needs.build-images.outputs.daemon }}

Check failure on line 115 in .github/workflows/e2e-test.yaml (GitHub Actions / actionlint):
.github/workflows/e2e-test.yaml:115:37: property "daemon" is not defined in object type {autoscaler-agent: string; controller: string; runner: string; scheduler: string; vxlan-controller: string} [expression]
IMG_SCHEDULER: ${{ needs.build-images.outputs.scheduler }}
IMG_AUTOSCALER_AGENT: ${{ needs.build-images.outputs.autoscaler-agent }}

11 changes: 8 additions & 3 deletions Makefile
@@ -2,6 +2,7 @@
IMG_CONTROLLER ?= controller:dev
IMG_VXLAN_CONTROLLER ?= vxlan-controller:dev
IMG_RUNNER ?= runner:dev
IMG_DAEMON ?= daemon:dev
IMG_SCHEDULER ?= autoscale-scheduler:dev
IMG_AUTOSCALER_AGENT ?= autoscaler-agent:dev

@@ -132,8 +133,8 @@ build: fmt vet bin/vm-builder ## Build all neonvm binaries.
GOOS=linux go build -o bin/runner neonvm/runner/*.go

.PHONY: bin/vm-builder
bin/vm-builder: ## Build vm-builder binary.
GOOS=linux CGO_ENABLED=0 go build -o bin/vm-builder -ldflags "-X main.Version=${GIT_INFO}" neonvm/tools/vm-builder/main.go
bin/vm-builder: docker-build-daemon ## Build vm-builder binary.
GOOS=linux CGO_ENABLED=0 go build -o bin/vm-builder -ldflags "-X main.Version=${GIT_INFO} -X main.NeonvmDaemonImage=${IMG_DAEMON}" neonvm/tools/vm-builder/main.go

.PHONY: run
run: fmt vet ## Run a controller from your host.
@@ -147,7 +148,7 @@ lint: ## Run golangci-lint against code.
# (i.e. docker build --platform linux/arm64 ). However, you must enable docker buildKit for it.
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
.PHONY: docker-build
docker-build: docker-build-controller docker-build-runner docker-build-vxlan-controller docker-build-autoscaler-agent docker-build-scheduler ## Build docker images for NeonVM controllers, NeonVM runner, autoscaler-agent, scheduler
docker-build: docker-build-controller docker-build-runner docker-build-daemon docker-build-vxlan-controller docker-build-autoscaler-agent docker-build-scheduler ## Build docker images for NeonVM controllers, NeonVM runner, autoscaler-agent, scheduler

.PHONY: docker-push
docker-push: docker-build ## Push docker images to docker registry
@@ -182,6 +183,10 @@ docker-build-runner: docker-build-go-base ## Build docker image for NeonVM runner
--file neonvm/runner/Dockerfile \
.

.PHONY: docker-build-daemon
docker-build-daemon: ## Build docker image for NeonVM daemon.
docker build -t $(IMG_DAEMON) -f neonvm/daemon/Dockerfile .

.PHONY: docker-build-vxlan-controller
docker-build-vxlan-controller: docker-build-go-base ## Build docker image for NeonVM vxlan controller
docker build \
5 changes: 5 additions & 0 deletions neonvm/apis/neonvm/v1/virtualmachine_types.go
@@ -136,6 +136,11 @@ type VirtualMachineSpec struct {
// +optional
RunnerImage *string `json:"runnerImage,omitempty"`

// Rely on neonvm-daemon inside the VM for fractional CPU limiting
// +kubebuilder:default:=false
// +optional
DelegatedCPULimits *bool `json:"delegatedCPULimits,omitempty"`

// Enable SSH on the VM. It works only if the VM image is built using VM Builder that
// has SSH support (TODO: mention VM Builder version).
// +kubebuilder:default:=true
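As a hedged illustration (not part of this commit), opting into the new field from Go could look like the sketch below; the import path and alias are assumptions based on this repository's layout, and the use of samber/lo mirrors how optional booleans are handled elsewhere in this diff.

// Sketch only: import path and alias are assumptions, not taken from this commit.
import (
	vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"

	"github.com/samber/lo"
)

func delegatedVMSpec() vmv1.VirtualMachineSpec {
	return vmv1.VirtualMachineSpec{
		// Delegate fractional CPU limiting to neonvm-daemon inside the VM
		// (the field defaults to false when unset).
		DelegatedCPULimits: lo.ToPtr(true),
	}
}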
5 changes: 5 additions & 0 deletions neonvm/apis/neonvm/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml
@@ -831,6 +831,11 @@ spec:
type: array
type: object
type: object
delegatedCPULimits:
default: false
description: Rely on neonvm-daemon inside the VM for fractional CPU
limiting
type: boolean
disks:
description: List of disk that can be mounted by virtual machine.
items:
16 changes: 11 additions & 5 deletions neonvm/controllers/vm_controller.go
@@ -1420,6 +1420,8 @@ func podSpec(
return nil, fmt.Errorf("marshal VM Status: %w", err)
}

delegatedCPULimits := lo.FromPtr(vm.Spec.DelegatedCPULimits)

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Status.PodName,
@@ -1486,7 +1488,9 @@
}},
Command: func() []string {
cmd := []string{"runner"}
if config.UseContainerMgr || config.DisableRunnerCgroup {
if delegatedCPULimits {
cmd = append(cmd, "-delegated-cgroup")
} else if config.UseContainerMgr || config.DisableRunnerCgroup {
cmd = append(cmd, "-skip-cgroup-management")
}
if config.DisableRunnerCgroup {
@@ -1535,7 +1539,7 @@
MountPropagation: lo.ToPtr(corev1.MountPropagationNone),
}

if config.UseContainerMgr || config.DisableRunnerCgroup {
if config.UseContainerMgr || config.DisableRunnerCgroup || delegatedCPULimits {
return []corev1.VolumeMount{images}
} else {
// the /sys/fs/cgroup mount is only necessary if neonvm-runner has to
@@ -1595,7 +1599,7 @@
},
}

if config.UseContainerMgr {
if config.UseContainerMgr && !delegatedCPULimits {
return []corev1.Container{runner, containerMgr}
} else {
// Return only the runner if we aren't supposed to use container-mgr
@@ -1628,7 +1632,9 @@
},
}

if config.UseContainerMgr {
if delegatedCPULimits {
return []corev1.Volume{images}
} else if config.UseContainerMgr {
return []corev1.Volume{images, containerdSock}
} else if config.DisableRunnerCgroup {
return []corev1.Volume{images}
@@ -1687,7 +1693,7 @@
// If a custom neonvm-runner image is requested, use that instead:
if vm.Spec.RunnerImage != nil {
pod.Spec.Containers[0].Image = *vm.Spec.RunnerImage
if config.UseContainerMgr {
if config.UseContainerMgr && !delegatedCPULimits {
pod.Spec.Containers[1].Image = *vm.Spec.RunnerImage
}
}
25 changes: 25 additions & 0 deletions neonvm/daemon/Dockerfile
@@ -0,0 +1,25 @@
# Build the Go binary
FROM golang:1.21 AS builder
ARG TARGETOS
ARG TARGETARCH

WORKDIR /workspace
# Copy the Go Modules manifests
COPY go.mod go.mod
COPY go.sum go.sum
# cache deps before building and copying source so that we don't need to re-download as much
# and so that source changes don't invalidate our downloaded layer
RUN go mod download

# Copy the go source
COPY neonvm/daemon/main.go neonvm/daemon/main.go

# Build
# GOARCH is left without a default value so the binary is built for the host where the command
# is run. For example, `make docker-build` on Apple Silicon (M1) gives a docker BUILDPLATFORM
# of linux/arm64, while on x86 it is linux/amd64. Leaving it empty ensures the container and
# the binary shipped in it have the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o /neonvmd neonvm/daemon/main.go

FROM alpine:3.18
COPY --from=builder /neonvmd /neonvmd
179 changes: 179 additions & 0 deletions neonvm/daemon/main.go
@@ -0,0 +1,179 @@
package main

import (
"errors"
"flag"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
"time"

"go.uber.org/zap"
)

// the default period is 100000 (i.e. 100 milliseconds). We use 5 milliseconds here because
// running out of quota can result in stalling until the end of the period, and a shorter period
// *generally* helps keep latencies more consistent (at the cost of using more CPU for scheduling).
const cpuPeriodMicroseconds = 5000

func main() {
addr := flag.String("addr", "", `address to bind for HTTP requests`)
cgroup := flag.String("cgroup", "", `cgroup for CPU limits`)
flag.Parse()

if *addr == "" {
fmt.Println("neonvm-daemon missing -addr flag")
os.Exit(1)
}

logConfig := zap.NewProductionConfig()
logConfig.Sampling = nil // Disable sampling, which the production config enables by default.
logConfig.Level.SetLevel(zap.InfoLevel) // Only "info" level and above (i.e. not debug logs)
logger := zap.Must(logConfig.Build()).Named("neonvm-daemon")
defer logger.Sync() //nolint:errcheck // what are we gonna do, log something about it?

logger.Info("Starting neonvm-daemon", zap.String("addr", *addr), zap.String("cgroup", *cgroup))

srv := cpuServer{
cgroup: *cgroup,
}
srv.run(logger, *addr)
}

type cpuServer struct {
cgroup string
}

func (s *cpuServer) run(logger *zap.Logger, addr string) {
logger = logger.Named("cpu-srv")

mux := http.NewServeMux()
mux.HandleFunc("/cpu", func(w http.ResponseWriter, r *http.Request) {
if r.Method == http.MethodGet {
_ = r.Body.Close()

cpu, err := s.getCPU(logger)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
return
}

w.WriteHeader(http.StatusOK)
w.Write([]byte(fmt.Sprintf("%d", cpu)))

Check failure on line 65 in neonvm/daemon/main.go (GitHub Actions / golangci-lint): Error return value of `w.Write` is not checked (errcheck)
} else if r.Method == http.MethodPut {
body, err := io.ReadAll(r.Body)
if err != nil {
logger.Error("could not read request body", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}

milliCPU, err := strconv.ParseUint(string(body), 10, 32)
if err != nil {
logger.Error("could not parse request body as uint32", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}

s.setCPU(logger, uint32(milliCPU))

Check failure on line 81 in neonvm/daemon/main.go (GitHub Actions / golangci-lint): Error return value of `s.setCPU` is not checked (errcheck)
} else {
// unknown method
w.WriteHeader(http.StatusNotFound)
}
})

timeout := 5 * time.Second
server := http.Server{
Addr: addr,
Handler: mux,
ReadTimeout: timeout,
ReadHeaderTimeout: timeout,
WriteTimeout: timeout,
}

err := server.ListenAndServe()
if err != nil {
logger.Fatal("CPU server exited with error", zap.Error(err))
}
logger.Info("CPU server exited without error")
}

func (s *cpuServer) cpuMaxPath() string {
return fmt.Sprintf("/sys/fs/cgroup/%s/cpu.max", s.cgroup)
}

func (s *cpuServer) setCPU(logger *zap.Logger, milliCPU uint32) error {
path := s.cpuMaxPath()
quota := milliCPU * (cpuPeriodMicroseconds / 1000)

fileContents := fmt.Sprintf("%d %d", quota, cpuPeriodMicroseconds)
file, err := os.OpenFile(path, os.O_WRONLY, 0)
if err != nil {
logger.Error("could not open cgroup cpu.max file for writing", zap.Error(err))
return err
}

_, err = file.WriteString(fileContents)
if err != nil {
logger.Error("could not write to cgroup cpu.max", zap.Error(err))
return err
}

return nil
}
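
// Worked example (added for illustration; not in the original file): with
// cpuPeriodMicroseconds = 5000, a request of 250 milliCPU writes "1250 5000" to cpu.max,
// i.e. 1250µs of CPU time per 5000µs period, or a quarter of one CPU; 1000 milliCPU
// writes "5000 5000", a full CPU.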

// returns the current CPU limit, measured in milli-CPUs
func (s *cpuServer) getCPU(logger *zap.Logger) (uint32, error) {
data, err := os.ReadFile(s.cpuMaxPath())
if err != nil {
logger.Error("could not read cgroup cpu.max", zap.Error(err))
return 0, err
}

cpuLimit, err := parseCgroupCPUMax(string(data))
if err != nil {
logger.Error("could not parse cgroup cpu.max", zap.Error(err))
return 0, err
}

if cpuLimit.quota == nil {
// "0" isn't quite correct here (maybe it should be 1<<32 - 1), but zero is a more typical
// sentinel value, and will still produce the same results.
return 0, nil
}
return uint32(1000 * (*cpuLimit.quota) / cpuLimit.period), nil
}
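
// Note (illustration; not in the original file): reading back "1250 5000" yields
// 1000*1250/5000 = 250 milliCPU, while "max 100000" (no quota set) takes the
// quota == nil branch above and reports 0.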

type cpuMax struct {
quota *uint64
period uint64
}

func parseCgroupCPUMax(data string) (*cpuMax, error) {
// the contents of cpu.max are "$MAX $PERIOD", where:
// - $MAX is either a number of microseconds or the literal string "max" (meaning no limit), and
// - $PERIOD is a number of microseconds over which to account $MAX
arr := strings.Split(strings.Trim(string(data), "\n"), " ")
if len(arr) != 2 {
return nil, errors.New("unexpected contents of cgroup cpu.max")
}

var quota *uint64
if arr[0] != "max" {
q, err := strconv.ParseUint(arr[0], 10, 64)
if err != nil {
return nil, fmt.Errorf("could not parse cpu quota: %w", err)
}
quota = &q
}

period, err := strconv.ParseUint(arr[1], 10, 64)
if err != nil {
return nil, fmt.Errorf("could not parse cpu period: %w", err)
}

return &cpuMax{quota: quota, period: period}, nil
}
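
For reference, a minimal client sketch (illustrative only, not part of this commit) showing how something like neonvm-runner might drive the daemon's /cpu endpoint: a PUT with a decimal milli-CPU body applies a limit, and a GET reads it back. The address is an assumption; the real one is whatever gets passed via -addr.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

// setVMCPU asks neonvm-daemon to apply a new fractional CPU limit, in milli-CPUs,
// by sending the decimal value as the PUT body.
func setVMCPU(baseURL string, milliCPU uint32) error {
	req, err := http.NewRequest(http.MethodPut, baseURL+"/cpu", bytes.NewBufferString(fmt.Sprint(milliCPU)))
	if err != nil {
		return err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status: %s", resp.Status)
	}
	return nil
}

// getVMCPU reads the current limit back as the decimal milli-CPU string the daemon returns.
func getVMCPU(baseURL string) (string, error) {
	resp, err := http.Get(baseURL + "/cpu")
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

func main() {
	base := "http://127.0.0.1:25183" // assumed address; must match the daemon's -addr flag
	if err := setVMCPU(base, 250); err != nil {
		panic(err)
	}
	limit, err := getVMCPU(base)
	if err != nil {
		panic(err)
	}
	fmt.Println("current limit (milliCPU):", limit)
}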
