Skip to content

Commit

Permalink
eventstream: rework Client's channels
Browse files Browse the repository at this point in the history
This change started by creating two separate channels for events from
the Event Stream API and from the replay phase. During debugging, I came
across multiple bugs, which are somewhat all addressed here.

- I used buffered channels to pass events from the producers to the
  single dispatcher consumer, even as I should have known better. When
  the last replay producer has finished, another channel is used to
  communicate this state change. Because of my lazy buffered channel
  hack, the last events were raced by the finish signal.
- After restoring (channel processing) order, the producers need a way
  to quickly exit when the consumer has finished. Thus, a both reading
  and writing switch - checking the context's Done channel - was
  introduced to all producers.
- Some safeguard checks were introduced, which, e.g., detected the
  channel race error listed above.
- Somehow during a prior refactoring, the Client.fetchServiceGroups
  method was broken to also query the host groups instead of, as its
  name says, the service groups.
- My Docker-based testing environment sends SIGTERM instead of SIGINT,
  which, for other reasons, does not even reached the binary. Now
  SIGTERM is honored for the main context as well.
- Some documentation especially regarding error messages had either
  typos or grammatically mistakes.
  • Loading branch information
oxzi committed Oct 31, 2023
1 parent db8f8b7 commit 26a3295
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 43 deletions.
3 changes: 2 additions & 1 deletion cmd/icinga-notifications-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"os"
"os/signal"
"runtime"
"syscall"
"time"
)

Expand Down Expand Up @@ -75,7 +76,7 @@ func main() {
}
}

ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer cancel()

runtimeConfig := config.NewRuntimeConfig(db, logs)
Expand Down
70 changes: 40 additions & 30 deletions internal/eventstream/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"crypto/tls"
"crypto/x509"
"errors"
"fmt"
"github.com/icinga/icinga-notifications/internal/config"
"github.com/icinga/icinga-notifications/internal/event"
Expand All @@ -21,12 +22,6 @@ import (

// This file contains the main resp. common methods for the Client.

// outgoingEvent is a wrapper around an event.Event and its producer's origin to be sent to the eventDispatcher.
type outgoingEvent struct {
event *event.Event
fromEventStream bool
}

// Client for the Icinga 2 Event Stream API with extended support for other Icinga 2 APIs to gather additional
// information and allow a replay in case of a connection loss.
type Client struct {
Expand All @@ -48,8 +43,11 @@ type Client struct {
// Logger to log to.
Logger *logging.Logger

// eventDispatch communicates Events to be processed between producer and consumer.
eventDispatch chan *outgoingEvent
// eventDispatcherEventStream communicates Events to be processed from the Event Stream API.
eventDispatcherEventStream chan *event.Event
// eventDispatcherReplay communicates Events to be processed from the Icinga 2 API replay during replay phase.
eventDispatcherReplay chan *event.Event

// replayTrigger signals the eventDispatcher method that the replay phase is finished.
replayTrigger chan struct{}
// replayPhase indicates that Events will be cached as the Event Stream Client is in the replay phase.
Expand Down Expand Up @@ -192,22 +190,22 @@ func (client *Client) buildHostServiceEvent(result CheckResult, state int, host,

if service != "" {
switch state {
case 0:
case 0: // OK
eventSeverity = event.SeverityOK
case 1:
case 1: // WARNING
eventSeverity = event.SeverityWarning
case 2:
case 2: // CRITICAL
eventSeverity = event.SeverityCrit
default:
default: // UNKNOWN or faulty
eventSeverity = event.SeverityErr
}
} else {
switch state {
case 0:
case 0: // UP
eventSeverity = event.SeverityOK
case 1:
case 1: // DOWN
eventSeverity = event.SeverityCrit
default:
default: // faulty
eventSeverity = event.SeverityErr
}
}
Expand Down Expand Up @@ -248,7 +246,7 @@ func (client *Client) eventDispatcher() {
for {
select {
case <-client.Ctx.Done():
client.Logger.Warnw("Closing event dispatcher as context is done", zap.Error(client.Ctx.Err()))
client.Logger.Warnw("Closing event dispatcher as its context is done", zap.Error(client.Ctx.Err()))
return

case <-client.replayTrigger:
Expand All @@ -259,12 +257,18 @@ func (client *Client) eventDispatcher() {
client.replayPhase.Store(false)
replayBuffer = []*event.Event{}

case ev := <-client.eventDispatch:
if client.replayPhase.Load() && ev.fromEventStream {
replayBuffer = append(replayBuffer, ev.event)
case ev := <-client.eventDispatcherEventStream:
if client.replayPhase.Load() {
replayBuffer = append(replayBuffer, ev)
} else {
client.CallbackFn(ev.event)
client.CallbackFn(ev)
}

case ev := <-client.eventDispatcherReplay:
if !client.replayPhase.Load() {
client.Logger.Errorw("Dispatcher received replay event during normal operation", zap.Stringer("event", ev))
}
client.CallbackFn(ev)
}
}
}
Expand All @@ -275,7 +279,10 @@ func (client *Client) eventDispatcher() {
// all of those have finished, the replayTrigger will be used to indicate that the buffered Events should be replayed.
func (client *Client) enterReplayPhase() {
client.Logger.Info("Entering replay phase to replay stored events first")
client.replayPhase.Store(true)
if !client.replayPhase.CompareAndSwap(false, true) {
client.Logger.Error("The Event Stream Client is already in the replay phase")
return
}

queryFns := []func(string){client.checkMissedAcknowledgements, client.checkMissedStateChanges}
objTypes := []string{"host", "service"}
Expand All @@ -293,7 +300,9 @@ func (client *Client) enterReplayPhase() {
}

go func() {
startTime := time.Now()
replayWg.Wait()
client.Logger.Debugw("All replay phase workers have finished", zap.Duration("duration", time.Since(startTime)))
client.replayTrigger <- struct{}{}
}()
}
Expand All @@ -304,22 +313,23 @@ func (client *Client) enterReplayPhase() {
// loop takes care of reconnections, all those events will be logged while generated Events will be dispatched to the
// callback function.
func (client *Client) Process() {
// These two channels will be used to communicate the Events and are crucial. As there are multiple producers and
// only one consumer, eventDispatcher, there is no ideal closer. However, producers and the consumer will be
// finished by the Client's context. When this happens, the main application should either be stopped or the Client
// is restarted, and we can hope for the GC. To make sure that nothing gets stuck, make the event channel buffered.
client.eventDispatch = make(chan *outgoingEvent, 1024)
client.eventDispatcherEventStream = make(chan *event.Event)
client.eventDispatcherReplay = make(chan *event.Event)
client.replayTrigger = make(chan struct{})

defer client.Logger.Info("Event Stream Client has stopped")

go client.eventDispatcher()

for {
err := client.listenEventStream()
if err != nil {
switch {
case errors.Is(err, context.Canceled):
client.Logger.Warnw("Stopping Event Stream Client as its context is done", zap.Error(err))
return

case err != nil:
client.Logger.Errorw("Event Stream processing failed", zap.Error(err))
} else {

default:
client.Logger.Warn("Event Stream closed stream; maybe Icinga 2 is reloading")
}
}
Expand Down
30 changes: 18 additions & 12 deletions internal/eventstream/client_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func (client *Client) queryObjectsApiQuery(objType string, query map[string]any)
})
}

// fetchHostGroup fetches all Host Groups for this host.
// fetchHostGroups fetches all Host Groups for this host.
func (client *Client) fetchHostGroups(host string) ([]string, error) {
jsonRaw, err := client.queryObjectsApiDirect("host", host)
if err != nil {
Expand All @@ -114,7 +114,7 @@ func (client *Client) fetchHostGroups(host string) ([]string, error) {

// fetchServiceGroups fetches all Service Groups for this service on this host.
func (client *Client) fetchServiceGroups(host, service string) ([]string, error) {
jsonRaw, err := client.queryObjectsApiDirect("host", host)
jsonRaw, err := client.queryObjectsApiDirect("service", host+"!"+service)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -231,15 +231,18 @@ func (client *Client) checkMissedChanges(objType, filterExpr string, attrsCallba
// checkMissedStateChanges fetches all objects of the requested type and feeds them into the handler.
func (client *Client) checkMissedStateChanges(objType string) {
client.checkMissedChanges(objType, "", func(attrs HostServiceRuntimeAttributes, host, service string) {
logger := client.Logger.With(zap.String("object type", objType))

ev, err := client.buildHostServiceEvent(attrs.LastCheckResult, attrs.State, host, service)
if err != nil {
client.Logger.Errorw("Failed to construct Event from API", zap.String("object type", objType), zap.Error(err))
logger.Errorw("Failed to construct Event from API", zap.Error(err))
return
}

client.eventDispatch <- &outgoingEvent{
event: ev,
fromEventStream: false,
select {
case <-client.Ctx.Done():
logger.Warnw("Cannot dispatch replayed event as context is finished", zap.Error(client.Ctx.Err()))
case client.eventDispatcherReplay <- ev:
}
})
}
Expand All @@ -264,9 +267,10 @@ func (client *Client) checkMissedAcknowledgements(objType string) {
return
}

client.eventDispatch <- &outgoingEvent{
event: ev,
fromEventStream: false,
select {
case <-client.Ctx.Done():
logger.Warnw("Cannot dispatch replayed event as context is finished", zap.Error(client.Ctx.Err()))
case client.eventDispatcherReplay <- ev:
}
})
}
Expand Down Expand Up @@ -356,9 +360,11 @@ func (client *Client) listenEventStream() error {
return err
}

client.eventDispatch <- &outgoingEvent{
event: ev,
fromEventStream: true,
select {
case <-client.Ctx.Done():
client.Logger.Warnw("Cannot dispatch Event Stream event as context is finished", zap.Error(client.Ctx.Err()))
return client.Ctx.Err()
case client.eventDispatcherEventStream <- ev:
}
}
return lineScanner.Err()
Expand Down

0 comments on commit 26a3295

Please sign in to comment.