Skip to content

Commit

Permalink
Fix metrics and handle pods baked by custom controllers (#12)
Browse files Browse the repository at this point in the history
  • Loading branch information
adriananeci authored Dec 21, 2022
1 parent b438290 commit 4b56a9a
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 12 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/go-co-op/gocron v1.16.2
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.13.0
github.com/prometheus/common v0.37.0
github.com/sirupsen/logrus v1.9.0
github.com/spf13/cobra v1.5.0
github.com/spf13/viper v1.12.0
Expand Down Expand Up @@ -53,7 +54,6 @@ require (
github.com/pelletier/go-toml v1.9.5 // indirect
github.com/pelletier/go-toml/v2 v2.0.1 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/spf13/afero v1.8.2 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NH
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
Expand Down Expand Up @@ -237,6 +238,7 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
Expand Down
56 changes: 54 additions & 2 deletions internal/testing/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,19 @@ governing permissions and limitations under the License.
package e2e

import (
"context"
"fmt"
"github.com/adobe/k8s-shredder/pkg/config"
"github.com/adobe/k8s-shredder/pkg/handler"
"github.com/adobe/k8s-shredder/pkg/utils"
"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
log "github.com/sirupsen/logrus"
"golang.org/x/exp/slices"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"os"
"strings"
"testing"
"time"
)
Expand Down Expand Up @@ -91,6 +98,51 @@ func compareTime(expirationTime time.Time, t *testing.T, ch chan time.Time) {
}

func TestShredderMetrics(t *testing.T) {
// TODO add metrics validation tests
t.Log("Metrics validation test passed!")

var warnings []string
var results []string

// Intentionally skipped the gauge metrics as they are going to be wiped out before every eviction loop
shredderMetrics := []string{
"shredder_loops_total",
"shredder_loops_duration_seconds",
"shredder_processed_nodes_total",
"shredder_processed_pods_total",
"shredder_errors_total",
}

for _, shredderMetric := range shredderMetrics {
result, warning, err := prometheusQuery(shredderMetric)
if err != nil {
t.Errorf("Error querying Prometheus: %v\n", err)
}
warnings = append(warnings, warning...)
results = append(results, result.String())
}

if len(warnings) > 0 {
t.Logf("Warnings: %v\n", strings.Join(warnings, "\n"))
}

t.Logf("Results: \n%v\n", strings.Join(results, "\n"))

if len(results) == len(shredderMetrics) {
t.Log("Metrics validation test passed!")
}
}

func prometheusQuery(query string) (model.Value, v1.Warnings, error) {

client, err := api.NewClient(api.Config{
Address: "http://localhost:30007",
})
if err != nil {
fmt.Printf("Error creating client: %v\n", err)
os.Exit(1)
}

v1api := v1.NewAPI(client)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
return v1api.Query(ctx, query, time.Now(), v1.WithTimeout(5*time.Second))
}
3 changes: 3 additions & 0 deletions internal/testing/kind.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ networking:
apiServerAddress: 0.0.0.0
nodes:
- role: control-plane
extraPortMappings:
- containerPort: 30007
hostPort: 30007
kubeadmConfigPatches:
- |
kind: InitConfiguration
Expand Down
2 changes: 2 additions & 0 deletions internal/testing/prometheus_stuffs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,13 @@ metadata:
name: prometheus
namespace: kube-system
spec:
type: NodePort
selector:
app: prometheus
ports:
- port: 9090
targetPort: 9090
nodePort: 30007
---
apiVersion: v1
kind: ConfigMap
Expand Down
25 changes: 19 additions & 6 deletions pkg/handler/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
log "github.com/sirupsen/logrus"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1beta1"
policy "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -69,9 +69,16 @@ func NewHandler(appContext *utils.AppContext) *Handler {

// Run starts an eviction loop
func (h *Handler) Run() error {
// start measuring the loop duration
loopTimer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) {
metrics.ShredderLoopsDurationSeconds.Observe(v * 10e6)
}))

// reset gauge metrics
metrics.ShredderNodeForceToEvictTime.Reset()
metrics.ShredderPodForceToEvictTime.Reset()
metrics.ShredderPodErrorsTotal.Reset()

h.logger.Infof("Starting eviction loop")

// sync all nodes goroutines
Expand Down Expand Up @@ -208,7 +215,14 @@ func (h *Handler) processNode(node v1.Node, rr chan *controllerObject) error {
h.logger.WithFields(log.Fields{
"namespace": pod.Namespace,
"pod": pod.Name,
}).Warnf("Failed to get pod controller object: %s", err.Error())
}).Warnf("Failed to get pod controller object: %s. Proceeding directly with pod eviction", err.Error())
err := h.evictPod(pod, deleteOptions)
if err != nil {
h.logger.WithFields(log.Fields{
"namespace": pod.Namespace,
"pod": pod.Name,
}).Warnf("Failed to evict pod: %s", err.Error())
}
continue
}

Expand Down Expand Up @@ -296,8 +310,7 @@ func (h *Handler) GetPodsForNode(node v1.Node) ([]v1.Pod, error) {
// evictPod evict a pod using the eviction API
func (h *Handler) evictPod(pod v1.Pod, deleteOptions *metav1.DeleteOptions) error {
h.logger.Infof("Evicting pod %s from %s namespace", pod.Name, pod.Namespace)
// TODO switch to stable V1 API version once we switch to k8s 1.22
err := h.appContext.K8sClient.PolicyV1beta1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{
err := h.appContext.K8sClient.PolicyV1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
Expand Down Expand Up @@ -367,9 +380,9 @@ func (h *Handler) getControllerObject(pod v1.Pod) (*controllerObject, error) {
return co, err
}
return newControllerObject("StatefulSet", sts.Name, sts.Namespace, sts), nil
default:
return co, errors.Errorf("Controller object of type %s is not a standard controller", pod.OwnerReferences[0].Kind)
}

return co, errors.Errorf("could not find controller object for type %s\n", pod.OwnerReferences[0].Kind)
}

func (h *Handler) isRolloutRestartInProgress(co *controllerObject) (bool, error) {
Expand Down
6 changes: 3 additions & 3 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ var (
)

// ShredderPodErrorsTotal = Total pod errors
ShredderPodErrorsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
ShredderPodErrorsTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "shredder_pod_errors_total",
Help: "Total pod errors",
Help: "Total pod errors per eviction loop",
},
[]string{"pod_name", "namespace", "reason", "action"},
)
Expand Down

0 comments on commit 4b56a9a

Please sign in to comment.