Skip to content

Commit

Permalink
feat: match-up health check with latency; add additional health configs
Browse files Browse the repository at this point in the history
  • Loading branch information
lvlcn-t committed Jan 8, 2024
1 parent ab56ca0 commit 03c2fdb
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 142 deletions.
36 changes: 29 additions & 7 deletions pkg/checks/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,33 @@ package checks
import (
"context"
"net/http"
"sync"
"time"

"github.com/getkin/kin-openapi/openapi3"
"github.com/prometheus/client_golang/prometheus"

"github.com/caas-team/sparrow/internal/helper"
"github.com/caas-team/sparrow/pkg/api"
)

// RegisteredChecks will be registered in this map
// The key is the name of the Check
// The name needs to map the configuration item key
var RegisteredChecks = map[string]func() Check{
"health": NewHealthCheck,
"latency": NewLatencyCheck,
}
var (
// RegisteredChecks maps the name of every available check to the
// constructor that creates it.
// The key is the name of the Check and has to match the key of the
// corresponding item in the configuration.
RegisteredChecks = map[string]func() Check{
"health": NewHealthCheck,
"latency": NewLatencyCheck,
}
// DefaultRetry provides a default configuration for the retry mechanism:
// a Count of 3 and a Delay of one second.
DefaultRetry = helper.RetryConfig{
Count: 3,
Delay: time.Second,
}
)

// Check implementations are expected to perform specific monitoring tasks and report results.
//
//go:generate moq -out checks_moq.go . Check
type Check interface {
// Run is called once per check interval
Expand Down Expand Up @@ -67,6 +78,16 @@ type Check interface {
GetMetricCollectors() []prometheus.Collector
}

// CheckBase is a struct providing common fields used by implementations of the Check interface.
// It serves as a foundational structure that should be embedded in specific check implementations.
// Because it contains a sync.Mutex, a CheckBase (and any struct embedding it)
// must not be copied after first use.
type CheckBase struct {
// mu guards the mutable state of the check, e.g. the config and the client
mu sync.Mutex
// cResult is the send-only channel on which finished results are published
cResult chan<- Result
// done is received from in the run loop to stop the check without an error
done chan bool
// client is the HTTP client used to perform the check's requests
client *http.Client
}

// Result encapsulates the outcome of a check run.
type Result struct {
// data contains performance metrics about the check run
Data any `json:"data"`
Expand All @@ -84,6 +105,7 @@ type GlobalTarget struct {
LastSeen time.Time `json:"lastSeen"`
}

// ResultDTO is a data transfer object used to associate a check's name with its result.
type ResultDTO struct {
Name string
Result *Result
Expand Down
156 changes: 87 additions & 69 deletions pkg/checks/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package checks
import (
"context"
"fmt"
"io"
"net/http"
"sync"
"time"
Expand All @@ -33,82 +34,94 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

var stateMapping = map[int]string{
0: "unhealthy",
1: "healthy",
}
var (
// Compile-time assertion that *Health implements the Check interface.
_ Check = (*Health)(nil)
// stateMapping translates the numeric state of a target into the
// status label reported in a HealthResult.
stateMapping = map[int]string{
0: "unhealthy",
1: "healthy",
}
)

// Health is a check that measures the availability of an endpoint
type Health struct {
CheckBase
route string
config HealthConfig
c chan<- Result
done chan bool
metrics healthMetrics
}

// HealthConfig contains the health check config
type HealthConfig struct {
Targets []string `json:"targets,omitempty"`
}

// Data that will be stored in the database
type healthData struct {
Targets []Target `json:"targets"`
// NewHealthCheck builds a health check in its initial state: it serves the
// "health" route, retries with DefaultRetry, owns a fresh HTTP client and has
// no result channel assigned yet.
func NewHealthCheck() Check {
	h := &Health{route: "health"}

	// The mutex and the result channel keep their zero values; only the
	// remaining base fields need explicit initialisation.
	h.done = make(chan bool, 1)
	h.client = &http.Client{}

	h.config.Retry = DefaultRetry
	h.metrics = newHealthMetrics()

	return h
}

// Defined metric collectors of health check
type healthMetrics struct {
health *prometheus.GaugeVec
// HealthConfig defines the configuration parameters for a health check
type HealthConfig struct {
// Targets lists the targets to check; each one is requested with an HTTP GET
Targets []string
// Interval is the delay between two check runs.
// SetConfig multiplies the decoded value by time.Second, so a configured
// number is interpreted as seconds.
Interval time.Duration
// Timeout is the timeout applied to the HTTP client.
// The check multiplies it by time.Second when assigning it to the client,
// so the configured number is interpreted as seconds.
Timeout time.Duration
// Retry configures the retry mechanism wrapped around each request.
// SetConfig multiplies Retry.Delay by time.Second.
Retry helper.RetryConfig
}

type Target struct {
// HealthResult represents the result of a single health check for a specific target
type HealthResult struct {
// Target is the checked target as listed in HealthConfig.Targets
Target string `json:"target"`
// Status is the label from stateMapping: "healthy" or "unhealthy"
Status string `json:"status"`
}

// NewHealthCheck creates a new HealthCheck
func NewHealthCheck() Check {
return &Health{
route: "health",
config: HealthConfig{},
metrics: newHealthMetrics(),
c: nil,
done: make(chan bool, 1),
}
// healthMetrics groups the metric collectors of the health check
type healthMetrics struct {
// health is a Prometheus gauge vector.
// NOTE(review): presumably one gauge per target holding its health state;
// confirm against newHealthMetrics.
health *prometheus.GaugeVec
}

// Run starts the health check loop.
//
// After every configured interval it checks all targets and publishes the
// outcome as a Result on the channel handed over in Startup. The loop ends
// with ctx.Err() when the context is done, or with nil when a value is
// received on the done channel (soft shutdown).
func (h *Health) Run(ctx context.Context) error {
	ctx, cancel := logger.NewContextWithLogger(ctx, "health")
	defer cancel()
	log := logger.FromContext(ctx)
	// This is the health check; the message previously named the latency check.
	log.Info(fmt.Sprintf("Using health check interval of %s", h.config.Interval.String()))

	for {
		select {
		case <-ctx.Done():
			log.Error("Context canceled", "err", ctx.Err())
			return ctx.Err()
		case <-h.done:
			log.Debug("Soft shut down")
			return nil
		case <-time.After(h.config.Interval):
			res := h.check(ctx)
			// Failures of individual targets are part of the data, so the
			// error field of the result stays empty.
			errval := ""
			r := Result{
				Data:      res,
				Err:       errval,
				Timestamp: time.Now(),
			}

			h.cResult <- r
			log.Debug("Successfully finished health check run")
		}
	}
}

// Startup is called once when the health check is registered
func (h *Health) Startup(_ context.Context, cResult chan<- Result) error {
h.c = cResult
func (h *Health) Startup(ctx context.Context, cResult chan<- Result) error {
log := logger.FromContext(ctx).WithGroup("latency")
log.Debug("Starting latency check")

h.cResult = cResult
return nil
}

Expand All @@ -122,23 +135,30 @@ func (h *Health) Shutdown(_ context.Context) error {

// SetConfig sets the configuration for the health check
func (h *Health) SetConfig(_ context.Context, config any) error {
var checkCfg HealthConfig
if err := mapstructure.Decode(config, &checkCfg); err != nil {
var c HealthConfig
if err := mapstructure.Decode(config, &c); err != nil {
return ErrInvalidConfig
}
h.config = checkCfg
c.Interval *= time.Second
c.Retry.Delay *= time.Second
h.mu.Lock()
defer h.mu.Unlock()
h.config = c

return nil
}

// SetClient sets the http client for the health check
func (h *Health) SetClient(_ *http.Client) {
// TODO: implement with issue #31
func (h *Health) SetClient(c *http.Client) {
// Hold the mutex while swapping the client; the check takes the same
// mutex when it updates the client's timeout.
h.mu.Lock()
defer h.mu.Unlock()
h.client = c
}

// Schema provides the schema of the data that will be provided
// by the heath check
// by the health check
func (h *Health) Schema() (*openapi3.SchemaRef, error) {
return OpenapiFromPerfData[healthData](healthData{})
return OpenapiFromPerfData[[]HealthResult]([]HealthResult{})
}

// RegisterHandler dynamically registers a server handler
Expand All @@ -147,7 +167,7 @@ func (h *Health) RegisterHandler(ctx context.Context, router *api.RoutingTree) {
router.Add(http.MethodGet, h.route, func(w http.ResponseWriter, _ *http.Request) {
_, err := w.Write([]byte("ok"))
if err != nil {
log.Error("Could not write response", "error", err.Error())
log.Error("Could not write response", "error", err)
}
})
}
Expand Down Expand Up @@ -181,30 +201,30 @@ func (h *Health) GetMetricCollectors() []prometheus.Collector {

// check performs a health check using a retry function
// to get the health status for all targets
func (h *Health) check(ctx context.Context) healthData {
func (h *Health) check(ctx context.Context) []HealthResult {
log := logger.FromContext(ctx).WithGroup("check")
log.Debug("Checking health")
if len(h.config.Targets) == 0 {
log.Debug("No targets defined")
return healthData{}
return []HealthResult{}
}
log.Debug("Getting health status for each target in separate routine", "amount", len(h.config.Targets))

var hd healthData
var wg sync.WaitGroup
var mu sync.Mutex
results := []HealthResult{}

h.mu.Lock()
h.client.Timeout = h.config.Timeout * time.Second
h.mu.Unlock()
for _, t := range h.config.Targets {
target := t
wg.Add(1)
l := log.With("target", target)

getHealthRetry := helper.Retry(func(ctx context.Context) error {
return getHealth(ctx, target)
}, helper.RetryConfig{
Count: 3,
Delay: time.Second,
})
return getHealth(ctx, h.client, target)
}, h.config.Retry)

go func() {
defer wg.Done()
Expand All @@ -213,12 +233,13 @@ func (h *Health) check(ctx context.Context) healthData {
l.Debug("Starting retry routine to get health status")
if err := getHealthRetry(ctx); err != nil {
state = 0
l.Warn("Error while checking health", "error", err)
}

l.Debug("Successfully got health status of target", "status", stateMapping[state])
mu.Lock()
defer mu.Unlock()
hd.Targets = append(hd.Targets, Target{
results = append(results, HealthResult{
Target: target,
Status: stateMapping[state],
})
Expand All @@ -230,34 +251,31 @@ func (h *Health) check(ctx context.Context) healthData {
wg.Wait()

log.Debug("Successfully got health status from all targets")
return hd
return results
}

// getHealth performs a http get request
// returns ok if status code is 200
func getHealth(ctx context.Context, url string) error {
// getHealth performs an HTTP get request and returns ok if status code is 200
func getHealth(ctx context.Context, client *http.Client, url string) error {
log := logger.FromContext(ctx).With("url", url)

client := &http.Client{
Timeout: time.Second * 5,
}

req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, http.NoBody)
if err != nil {
log.Error("Could not create http GET request", "error", err.Error())
return err
}

res, err := client.Do(req)
resp, err := client.Do(req) //nolint:bodyclose // Closed in defer below
if err != nil {
log.Error("Http get request failed", "error", err.Error())
return err
}
defer res.Body.Close()
defer func(Body io.ReadCloser) {
_ = Body.Close()
}(resp.Body)

if res.StatusCode != http.StatusOK {
log.Error("Http get request failed", "status", res.Status)
return fmt.Errorf("request failed, status is %s", res.Status)
if resp.StatusCode != http.StatusOK {
log.Error("Http get request failed", "status", resp.Status)
return fmt.Errorf("request failed, status is %s", resp.Status)
}

return nil
Expand Down
Loading

0 comments on commit 03c2fdb

Please sign in to comment.