Skip to content

Commit

Permalink
Add metrics support.
Browse files Browse the repository at this point in the history
  • Loading branch information
porridge committed May 22, 2024
1 parent 5a2d59a commit 2af7fc5
Show file tree
Hide file tree
Showing 23 changed files with 963 additions and 27 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,9 @@ updates:
interval: 'weekly'
day: 'tuesday'
open-pull-requests-limit: 3
- package-ecosystem: 'gomod'
directory: '/internal/metrics/tools'
schedule:
interval: 'weekly'
day: 'tuesday'
open-pull-requests-limit: 3
4 changes: 4 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ jobs:
./deploy/deploy --k8s-flavor ocp my-images > manifests/ocp.yaml
./deploy/deploy --k8s-flavor vanilla --secret my-secret my-images > manifests/vanilla-with-secret.yaml
./deploy/deploy --k8s-flavor ocp --secret my-secret my-images > manifests/ocp-with-secret.yaml
./deploy/deploy --k8s-flavor vanilla --collect-metrics my-images > manifests/vanilla-metrics.yaml
./deploy/deploy --k8s-flavor ocp --collect-metrics my-images > manifests/ocp-metrics.yaml
./deploy/deploy --k8s-flavor vanilla --secret my-secret --collect-metrics my-images > manifests/vanilla-with-secret-metrics.yaml
./deploy/deploy --k8s-flavor ocp --secret my-secret --collect-metrics my-images > manifests/ocp-with-secret-metrics.yaml
- name: kubeconform
run: |
Expand Down
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,22 @@ Talks directly to Container Runtime Interface ([CRI](https://kubernetes.io/docs/
- fetch all images on all nodes in parallel,
- retry pulls with increasingly longer timeouts. This prevents getting stuck on stalled connections to image registry.

It also optionally collects each pull attempt's duration and result.

## Architecture

### `image-prefetcher`

- main binary,
- shipped as an OCI image,
- provides two subcommands:
- provides three subcommands:
- `fetch`: runs the actual image pulls via CRI, meant to run as an init container
of DaemonSet pods.
Requires access to the CRI UNIX domain socket from the host.
- `sleep`: just sleeps forever, meant to run as the main container of DaemonSet pods.
- `aggregate-metrics`: runs a gRPC server which collects data points pushed by the
`fetch` pods, and makes the data available for download over HTTP.
Meant to run as a standalone pod.

### `deploy`

Expand All @@ -40,6 +45,7 @@ Talks directly to Container Runtime Interface ([CRI](https://kubernetes.io/docs/
- `--secret`: image pull `Secret` name. Required if the images are not pullable anonymously.
This image pull secret should be usable for all images fetched by the given instance.
If provided, it must be of type `kubernetes.io/dockerconfigjson` and exist in the same namespace.
- `--collect-metrics`: whether image pull metrics should be collected. When set, the manifest additionally contains a metrics aggregator pod and a service exposing it.

Example:

Expand Down Expand Up @@ -71,6 +77,26 @@ Talks directly to Container Runtime Interface ([CRI](https://kubernetes.io/docs/
kubectl logs -n prefetch-images daemonset/my-images -c prefetch
```

6. If metrics collection was requested, wait for the metrics service endpoint to appear, then fetch the metrics:
```
attempt=0
service="service/my-images-metrics"
while [[ -z $(kubectl -n "${ns}" get "${service}" -o jsonpath="{.status.loadBalancer.ingress}" 2>/dev/null) ]]; do
if [ "$attempt" -lt "10" ]; then
echo "Waiting for ${service} to obtain endpoint ..."
((attempt++))
sleep 10
else
echo "Timeout waiting for ${service} to obtain endpoint!"
exit 1
fi
done
endpoint="$(kubectl -n "${ns}" get "${service}" -o json | jq -r '.status.loadBalancer.ingress[] | .ip')"
curl "http://${endpoint}:8080/metrics" | jq
```

See the [Result](internal/metrics/metrics.proto) message definition for a list of fields.

### Customization

You can tweak certain parameters such as timeouts by editing `args` in the above manifest.
Expand Down
4 changes: 3 additions & 1 deletion cmd/fetch.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,15 @@ It talks to Container Runtime Interface API to pull images in parallel, with ret
return err
}
imageList = append(imageList, args...)
return internal.Run(logger, criSocket, dockerConfigJSONPath, timing, imageList...)
return internal.Run(logger, criSocket, dockerConfigJSONPath, timing, metricsEndpoint, imageList...)
},
}

var (
criSocket string
dockerConfigJSONPath string
imageListFile string
metricsEndpoint string
imageListTimeout = time.Minute
initialPullAttemptTimeout = 30 * time.Second
maxPullAttemptTimeout = 5 * time.Minute
Expand All @@ -56,6 +57,7 @@ func init() {
fetchCmd.Flags().StringVar(&criSocket, "cri-socket", "/run/containerd/containerd.sock", "Path to CRI UNIX socket.")
fetchCmd.Flags().StringVar(&dockerConfigJSONPath, "docker-config", "", "Path to docker config json file.")
fetchCmd.Flags().StringVar(&imageListFile, "image-list-file", "", "Path to text file containing images to pull (one per line).")
fetchCmd.Flags().StringVar(&metricsEndpoint, "metrics-endpoint", "", "A host:port to submit image pull metrics to.")

fetchCmd.Flags().DurationVar(&imageListTimeout, "image-list-timeout", imageListTimeout, "Timeout for image list calls (for debugging).")
fetchCmd.Flags().DurationVar(&initialPullAttemptTimeout, "initial-pull-attempt-timeout", initialPullAttemptTimeout, "Timeout for initial image pull call. Each subsequent attempt doubles it until max.")
Expand Down
34 changes: 34 additions & 0 deletions cmd/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package cmd

import (
"github.com/stackrox/image-prefetcher/internal/logging"
"github.com/stackrox/image-prefetcher/internal/metrics/server"

"github.com/spf13/cobra"
)

// aggregateMetricsCmd is the "aggregate-metrics" subcommand. Its handler
// ignores the command and positional arguments and starts the metrics server
// with the configured gRPC and HTTP ports.
var aggregateMetricsCmd = &cobra.Command{
	Use:   "aggregate-metrics",
	Short: "Accept metrics submissions and serve them.",
	Long: `This subcommand is intended to run in a single pod.
It serves:
- a gRPC endpoint to which individual metrics can be submitted,
- an HTTP endpoint from which the aggregate metrics can be fetched.`,
	RunE: func(_ *cobra.Command, _ []string) error {
		logger := logging.GetLogger()
		return server.Run(logger, grpcPort, httpPort)
	},
}

// Listening ports for the aggregate-metrics subcommand. Both are filled in
// from the --grpc-port and --http-port flags registered in init.
var grpcPort, httpPort int

// init attaches the aggregate-metrics subcommand to the root command and
// registers its logging and port flags.
func init() {
	rootCmd.AddCommand(aggregateMetricsCmd)

	// Flags() hands back the command's own flag set, so fetch it once and reuse it.
	flags := aggregateMetricsCmd.Flags()
	logging.AddFlags(flags)
	flags.IntVar(&grpcPort, "grpc-port", 8443, "Port for metrics submission gRPC endpoint to listen on.")
	flags.IntVar(&httpPort, "http-port", 8080, "Port for metrics retrieval HTTP endpoint to listen on.")
}
51 changes: 51 additions & 0 deletions deploy/deployment.yaml.gotpl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,54 @@ roleRef:
name: privileged-scc-use
---
{{ end }}
{{ if .CollectMetrics }}
# Metrics aggregator: a standalone pod running the `aggregate-metrics`
# subcommand. Rendered only when metrics collection is enabled.
apiVersion: v1
kind: Pod
metadata:
name: {{ .Name }}-metrics
labels:
app: {{ .Name }}-metrics
spec:
containers:
- name: aggregator
image: {{ .Image }}:{{ .Version }}
args:
- "aggregate-metrics"
- "--debug"
ports:
- containerPort: 8443
name: grpc
- containerPort: 8080
name: http
resources:
requests:
cpu: "5m"
memory: "16Mi"
limits:
cpu: "100m"
memory: "64Mi"
securityContext:
readOnlyRootFilesystem: true
runAsUser: 1000
runAsNonRoot: true
---
# Service in front of the aggregator pod, selecting it by its `app` label and
# publishing both the gRPC (8443) and HTTP (8080) ports.
# NOTE(review): with `type: LoadBalancer` the gRPC submission port is published
# alongside the HTTP one — confirm that exposing it beyond the cluster is intended.
apiVersion: v1
kind: Service
metadata:
name: {{ .Name }}-metrics
spec:
ports:
- name: grpc
port: 8443
protocol: TCP
- name: http
port: 8080
protocol: TCP
selector:
app: {{ .Name }}-metrics
type: LoadBalancer
---
{{ end }}
apiVersion: apps/v1
kind: DaemonSet
metadata:
Expand Down Expand Up @@ -64,6 +112,9 @@ spec:
{{ else }}
- "--cri-socket=/tmp/cri/containerd.sock"
{{ end }}
{{ if .CollectMetrics }}
- "--metrics-endpoint={{ .Name }}-metrics:8443"
{{ end }}
resources:
requests:
cpu: "20m"
Expand Down
10 changes: 7 additions & 3 deletions deploy/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type settings struct {
Secret string
IsCRIO bool
NeedsPrivileged bool
CollectMetrics bool
}

const (
Expand All @@ -32,15 +33,17 @@ const imageRepo = "quay.io/stackrox-io/image-prefetcher"
var deploymentTemplate string

var (
version string
k8sFlavor k8sFlavorType
secret string
version string
k8sFlavor k8sFlavorType
secret string
collectMetrics bool
)

func init() {
flag.StringVar(&version, "version", "v0.1.0", "Version of image prefetcher OCI image.")
flag.TextVar(&k8sFlavor, "k8s-flavor", flavor(vanillaFlavor), fmt.Sprintf("Kubernetes flavor. Accepted values: %s", strings.Join(allFlavors, ",")))
flag.StringVar(&secret, "secret", "", "Kubernetes image pull Secret to use when pulling.")
flag.BoolVar(&collectMetrics, "collect-metrics", false, "Whether to collect and expose image pull metrics.")
}

func main() {
Expand All @@ -59,6 +62,7 @@ func main() {
Secret: secret,
IsCRIO: isOcp,
NeedsPrivileged: isOcp,
CollectMetrics: collectMetrics,
}
tmpl := template.Must(template.New("deployment").Parse(deploymentTemplate))
if err := tmpl.Execute(os.Stdout, s); err != nil {
Expand Down
20 changes: 11 additions & 9 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
module github.com/stackrox/image-prefetcher

go 1.21
go 1.22.0

toolchain go1.21.7
toolchain go1.22.1

require (
github.com/cenkalti/backoff/v4 v4.2.1
github.com/google/uuid v1.6.0
github.com/spf13/cobra v1.8.0
github.com/spf13/pflag v1.0.5
google.golang.org/grpc v1.63.2
k8s.io/apimachinery v0.29.3
google.golang.org/protobuf v1.33.0
k8s.io/apimachinery v0.30.0
k8s.io/cri-api v0.29.3
k8s.io/klog/v2 v2.110.1
k8s.io/klog/v2 v2.120.1
)

require (
github.com/go-logr/logr v1.3.0 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/net v0.21.0 // indirect
golang.org/x/sys v0.17.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de // indirect
google.golang.org/protobuf v1.33.0 // indirect
)
24 changes: 14 additions & 10 deletions go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 2af7fc5

Please sign in to comment.