-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix wrong end_time #946
base: master
Are you sure you want to change the base?
Fix wrong end_time #946
Changes from 8 commits
a75a351
f837230
234cc22
240a44f
aa56697
c0975fb
ed86365
4849416
1fdac45
5cd1754
9f5c925
b4fd6f8
3d8fad4
68b22a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Ensure that `end_time` is correct according to state: `None` for stateless, | ||
less than datetime.max for closed. | ||
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4,6 +4,7 @@ | |||||
import logging | ||||||
from operator import and_ | ||||||
from random import randint, choice | ||||||
from typing import Optional | ||||||
from urllib.parse import urljoin | ||||||
|
||||||
from django.contrib.auth import get_user_model | ||||||
|
@@ -215,7 +216,28 @@ class Type(models.TextChoices): | |||||
Type.INCIDENT_CHANGE, | ||||||
Type.STATELESS, | ||||||
} | ||||||
ALLOWED_TYPES_FOR_END_USERS = {Type.CLOSE, Type.REOPEN, Type.ACKNOWLEDGE, Type.OTHER} | ||||||
ALLOWED_TYPES_FOR_END_USERS = { | ||||||
Type.ACKNOWLEDGE, | ||||||
Type.OTHER, | ||||||
Type.CLOSE, | ||||||
Type.REOPEN, | ||||||
} | ||||||
CLOSING_TYPES = { | ||||||
Type.INCIDENT_END, | ||||||
Type.CLOSE, | ||||||
} | ||||||
OPENING_TYPES = { | ||||||
Type.INCIDENT_START, | ||||||
Type.REOPEN, | ||||||
} | ||||||
STATE_TYPES = OPENING_TYPES | CLOSING_TYPES | ||||||
SHARED_TYPES = { | ||||||
Type.ACKNOWLEDGE, | ||||||
Type.OTHER, | ||||||
Type.INCIDENT_CHANGE, | ||||||
} | ||||||
STATELESS_TYPES = SHARED_TYPES | {Type.STATELESS} | ||||||
STATEFUL_TYPES = SHARED_TYPES | STATE_TYPES | ||||||
|
||||||
incident = models.ForeignKey(to="Incident", on_delete=models.PROTECT, related_name="events") | ||||||
actor = models.ForeignKey(to=User, on_delete=models.PROTECT, related_name="caused_events") | ||||||
|
@@ -334,8 +356,9 @@ def create_events(self, actor: User, event_type: Event.Type, timestamp=None, des | |||||
|
||||||
def close(self, actor: User, timestamp=None, description=""): | ||||||
"Close incidents correctly and create the needed events" | ||||||
timestamp = timestamp or timezone.now() | ||||||
qs = self.open() | ||||||
qs.update(end_time=timestamp or timezone.now()) | ||||||
qs.update(end_time=timestamp) | ||||||
qs = self.all() # Reload changes from database | ||||||
event_type = Event.Type.CLOSE | ||||||
events = qs.create_events(actor, event_type, timestamp, description) | ||||||
|
@@ -439,17 +462,36 @@ def tags(self): | |||||
def incident_relations(self): | ||||||
return IncidentRelation.objects.filter(Q(incident1=self) | Q(incident2=self)) | ||||||
|
||||||
def all_opening_events(self): | ||||||
open_events = Event.OPENING_TYPES | ||||||
return self.events.filter(type__in=open_events).order_by("timestamp") | ||||||
|
||||||
def all_reopen_events(self): | ||||||
return self.events.filter(type=Event.Type.REOPEN).order_by("timestamp") | ||||||
|
||||||
def all_closing_events(self): | ||||||
close_events = Event.CLOSING_TYPES | ||||||
return self.events.filter(type__in=close_events).order_by("timestamp") | ||||||
|
||||||
@property | ||||||
def start_event(self): | ||||||
return self.events.filter(type=Event.Type.INCIDENT_START).order_by("timestamp").first() | ||||||
|
||||||
@property | ||||||
def reopen_event(self): | ||||||
return self.all_reopen_events().last() | ||||||
|
||||||
@property | ||||||
def end_event(self): | ||||||
return self.events.filter(type=Event.Type.INCIDENT_END).order_by("timestamp").first() | ||||||
|
||||||
@property | ||||||
def close_event(self): | ||||||
return self.events.filter(type=Event.Type.CLOSE).order_by("timestamp").first() | ||||||
|
||||||
@property | ||||||
def last_close_or_end_event(self): | ||||||
return self.events.filter(type__in=(Event.Type.CLOSE, Event.Type.INCIDENT_END)).order_by("timestamp").last() | ||||||
return self.all_closing_events().last() | ||||||
|
||||||
@property | ||||||
def latest_change_event(self): | ||||||
|
@@ -475,6 +517,68 @@ def acked(self): | |||||
|
||||||
return self.events.filter((acks_query & acks_not_expired_query) | ack_is_just_being_created).exists() | ||||||
|
||||||
def event_already_exists(self, event_type): | ||||||
return self.events.filter(type=event_type).exists() | ||||||
|
||||||
def repair_end_time(self) -> Optional[bool]: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make sure that you have prefetched all events here, because otherwise this function may cause a bunch of round trips to the db. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what you mean. This is on the model instance. Prefetch before a loop over instances?
Did you mean something in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I meant in |
||||||
"""Repairs end_time if there is a mismatch between events and end_time | ||||||
|
||||||
This can happen under race-conditions and because we still cannot use | ||||||
the ``atomic``-decorator everywhere. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
Returns: | ||||||
* True if a repair needed to be made | ||||||
* False if it was stateful and ok | ||||||
* None if it was stateless and ok | ||||||
""" | ||||||
LOG.info("Incident %s: Detected potential mismatch of end_time and events", self.pk) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the first line of the method. Where did the mentioned detection take place? |
||||||
|
||||||
if not self.stateful: | ||||||
# the vital part for statelessness is set correctly | ||||||
LOG.info("Incident %s: No mismatch, correctly stateless", self.pk) | ||||||
return | ||||||
Comment on lines
+537
to
+540
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implies that you only want to call repair_end_time on stateful events? is that worth documenting? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see how that is implied.
If I removed this block, or the comment in it, then I would almost be willing to bet the big bux (I don't bet though, not even LOTTO. Too much knowledge of the right kind of mathematics.) that some time in the future some helpful soul complains that the method is incomplete because it lacks this block or its comment. |
||||||
|
||||||
if self.stateless_event: | ||||||
# Weird, stateless event without stateless end_time, fix | ||||||
self.end_time = None | ||||||
self.save() | ||||||
LOG.warn("Mismatch between self %s end_time and event type: set stateless", self.pk) | ||||||
return True | ||||||
Comment on lines
+542
to
+547
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems like a normalization issue? Two ways of indicating that an event is stateless? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh well 🤷 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a question of what should happen if there is a stateless event but end_time is wrong. Here, I trust the stateless event and unset end_time, but we could also delete the event. Though, then we would have to check that there is an incident-start event etc. etc. etc. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would trust the event log more than the actual event attributes, since these are both timestamped and have an authenticated author - i.e. I think it is OK to repair the |
||||||
|
||||||
# Only stateful incidents from this point on | ||||||
|
||||||
close_events = self.all_closing_events() | ||||||
if not close_events.exists(): | ||||||
if self.open: | ||||||
# Golden path for open incidents | ||||||
LOG.info("Incident %s: No mismatch, correctly stateful and open", self.pk) | ||||||
return False | ||||||
else: | ||||||
# missing close event. This is serious. | ||||||
message = "Incident %s has been closed without adding an event" | ||||||
LOG.error(message, self.pk) | ||||||
raise ValueError(message) | ||||||
|
||||||
# Only incidents with at least one close event from this point on | ||||||
|
||||||
if not self.open: | ||||||
# Golden path for closed incidents | ||||||
LOG.info("Incident %s: No mismatch, correctly stateful and closed", self.pk) | ||||||
return False | ||||||
|
||||||
reopen_event = self.reopen_event | ||||||
last_close_event = close_events.last() | ||||||
if not reopen_event or reopen_event.timestamp < last_close_event.timestamp: | ||||||
hmpf marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
# end_time was not set when making closing event, fix | ||||||
self.end_time = last_close_event.timestamp | ||||||
self.save() | ||||||
LOG.warn("Mismatch between self %s end_time and event type: set end_time to less than infinity", self.pk) | ||||||
return True | ||||||
|
||||||
# a reopen event correctly exists and the incident is correctly open | ||||||
LOG.info("Incident %s: No mismatch, correctly stateful and reopened", self.pk) | ||||||
return False | ||||||
|
||||||
def is_acked_by(self, group: str) -> bool: | ||||||
return group in self.acks.active().group_names() | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -465,6 +465,9 @@ def perform_create(self, serializer: EventSerializer): | |
# is sent after the incident has been manually closed | ||
if not user.is_source_system: | ||
raise e | ||
except AttributeError: | ||
# Do not save new event, it was redundant | ||
return | ||
else: | ||
# Only update incident if everything is valid; otherwise, just record the event | ||
self.update_incident(serializer.validated_data, incident) | ||
|
@@ -474,37 +477,50 @@ def perform_create(self, serializer: EventSerializer): | |
def validate_event_type_for_user(self, event_type: str, user: User): | ||
if user.is_source_system: | ||
if event_type not in Event.ALLOWED_TYPES_FOR_SOURCE_SYSTEMS: | ||
self._raise_type_validation_error(f"A source system cannot post events of type '{event_type}'.") | ||
self._abort_due_to_type_validation_error(f"A source system cannot post events of type '{event_type}'.") | ||
else: | ||
if event_type not in Event.ALLOWED_TYPES_FOR_END_USERS: | ||
self._raise_type_validation_error(f"An end user cannot post events of type '{event_type}'.") | ||
self._abort_due_to_type_validation_error(f"An end user cannot post events of type '{event_type}'.") | ||
Comment on lines
+482
to
+485
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I like this rename. I find myself questioning what "abort" means in this context, and I find I have to read the called function to understand it. I find |
||
|
||
def validate_event_type_for_incident(self, event_type: str, incident: Incident): | ||
def validate_incident_has_no_relation_to_event_type(): | ||
if incident.events.filter(type=event_type).exists(): | ||
self._raise_type_validation_error(f"The incident already has a related event of type '{event_type}'.") | ||
|
||
if incident.stateful: | ||
if event_type in {Event.Type.INCIDENT_START, Event.Type.INCIDENT_END}: | ||
validate_incident_has_no_relation_to_event_type() | ||
if event_type in {Event.Type.INCIDENT_END, Event.Type.CLOSE} and not incident.open: | ||
self._raise_type_validation_error("The incident is already closed.") | ||
elif event_type == Event.Type.REOPEN and incident.open: | ||
self._raise_type_validation_error("The incident is already open.") | ||
else: | ||
if event_type == Event.Type.STATELESS: | ||
validate_incident_has_no_relation_to_event_type() | ||
elif event_type == Event.Type.INCIDENT_START: | ||
self._raise_type_validation_error("Stateless incident cannot have an INCIDENT_START event.") | ||
elif event_type in {Event.Type.INCIDENT_END, Event.Type.CLOSE, Event.Type.REOPEN}: | ||
self._raise_type_validation_error("Cannot change the state of a stateless incident.") | ||
def abort_due_to_too_many_events(incident, event_type): | ||
error_msg = f"Incident #{incident.pk} can only have one event of type '{event_type}'." | ||
LOG.warn(error_msg) | ||
self._abort_due_to_type_validation_error(error_msg) | ||
|
||
if event_type == Event.Type.ACKNOWLEDGE: | ||
acks_endpoint = reverse("incident:incident-acks", args=[incident.pk], request=self.request) | ||
self._raise_type_validation_error( | ||
f"Acknowledgements of this incidents should be posted through {acks_endpoint}." | ||
self._abort_due_to_type_validation_error( | ||
f"Acknowledgement of an incident should be posted through {acks_endpoint}." | ||
) | ||
|
||
if incident.stateful: | ||
if incident.event_already_exists(event_type): | ||
if event_type == Event.Type.INCIDENT_START: | ||
# Only ever 1 | ||
abort_due_to_too_many_events(incident, event_type) | ||
if event_type == Event.Type.INCIDENT_END: | ||
# Only ever 1, but might not have been saved correctly earlier | ||
repaired = incident.repair_end_time() | ||
if repaired: | ||
raise AttributeError("end_time mismatch repaired, see logs") | ||
# should never happen | ||
LOG.error("Something weird happened, see other logs") | ||
raise AttributeError("end_time mismatch was in error, see logs") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AttributeError seems like a strange exception to raise here. Perhaps a custom Exception class instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See commit add new exceptions |
||
if event_type in Event.CLOSING_TYPES and not incident.open: | ||
self._abort_due_to_type_validation_error("The incident is already closed.") | ||
if event_type == Event.Type.REOPEN and incident.open: | ||
self._abort_due_to_type_validation_error("The incident is already open.") | ||
hmpf marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# type ok for stateful | ||
return | ||
|
||
# stateless from here | ||
if event_type == Event.Type.STATELESS and incident.event_already_exists(event_type): | ||
abort_due_to_too_many_events(incident, event_type) | ||
if event_type in Event.STATE_TYPES: | ||
self._abort_due_to_type_validation_error("Cannot change the state of a stateless incident.") | ||
|
||
def update_incident(self, validated_data: dict, incident: Incident): | ||
timestamp = validated_data["timestamp"] | ||
event_type = validated_data["type"] | ||
|
@@ -516,7 +532,7 @@ def update_incident(self, validated_data: dict, incident: Incident): | |
incident.save() | ||
|
||
@staticmethod | ||
def _raise_type_validation_error(message: str): | ||
def _abort_due_to_type_validation_error(message: str): | ||
raise serializers.ValidationError({"type": message}) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.