Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: PagerDuty service level operations #2439

Open
wants to merge 38 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
ad1be89
feat: pagerduty oauth
35C4n0r Nov 4, 2024
9ef71c1
fix: uncomment needed changes
35C4n0r Nov 4, 2024
5c60897
fix: type checks
35C4n0r Nov 4, 2024
7e01ab5
fix: CI pass
35C4n0r Nov 4, 2024
091c7ba
fix: update tests
35C4n0r Nov 4, 2024
a5da877
fix: update tests
35C4n0r Nov 4, 2024
02970fe
Merge branch 'main' into feat-pd-app
35C4n0r Nov 4, 2024
aee8a92
fix: imports
35C4n0r Nov 4, 2024
4a5edad
fix: update typescript
35C4n0r Nov 4, 2024
f2ddfd5
Merge branch 'main' into feat-pd-app
35C4n0r Nov 4, 2024
2c84a40
fix: resolve merge conflicts
35C4n0r Nov 4, 2024
bda66a1
fix: minor fixes
35C4n0r Nov 4, 2024
6a80845
fix: CI pass
35C4n0r Nov 4, 2024
a1daa04
fix: typos
35C4n0r Nov 4, 2024
4200317
chore: minor refactors
35C4n0r Nov 4, 2024
5120384
fix: extra flags for ai incident creation
35C4n0r Nov 5, 2024
b17b146
Merge branch 'main' into feat-pd-app
35C4n0r Nov 5, 2024
e2bff6d
Merge branch 'main' into feat-pd-app
talboren Nov 6, 2024
08b263c
fix(alertdto): something with url
talboren Nov 6, 2024
1cf1802
fix: improvements
talboren Nov 6, 2024
b017990
chore: add docstrings
35C4n0r Nov 7, 2024
c561a0c
Merge remote-tracking branch 'origin/feat-pd-app' into feat-pd-app
35C4n0r Nov 7, 2024
48f37ea
Merge branch 'main' into feat-pd-app
talboren Nov 10, 2024
9dc47e4
fix: wip
talboren Nov 10, 2024
e7112cf
fix: wip
talboren Nov 10, 2024
b9b5e04
fix: wip
talboren Nov 10, 2024
1f407ad
Merge branch 'main' into feat-pd-app
talboren Nov 10, 2024
66acb18
fix: providers
talboren Nov 10, 2024
cba0006
fix: fix
talboren Nov 10, 2024
ce075c3
fix: fix
talboren Nov 10, 2024
8b02198
fix: fix
talboren Nov 10, 2024
b339aed
fix: improvements
talboren Nov 10, 2024
00b9111
docs: pagerduty oauth in self hosted
talboren Nov 11, 2024
595cbee
Merge branch 'main' into feat-pd-app
talboren Nov 11, 2024
7573aac
fix: reverting a wrong description
35C4n0r Nov 11, 2024
b3241fb
feat: service level operations
35C4n0r Nov 11, 2024
532dde9
Merge branch 'main' into feat-pd-service
talboren Nov 12, 2024
b02fc31
Merge branch 'main' into feat-pd-service
shahargl Nov 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/providers/documentation/pagerduty-provider.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ To connect Keep to PagerDuty:

- **Routing Key**: Use for event posting via the PagerDuty Events API.
- **API Key**: Use for incident creation and management through the PagerDuty Incidents API.
- **Service ID** (Optional): If provided, Keep operates only within the scope of that service.
- **OAuth2**: Token management handled automatically by Keep.

<Frame>
Expand Down
2 changes: 2 additions & 0 deletions keep/api/models/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,7 @@ def from_db_incident(cls, db_incident: "Incident"):
assignee=db_incident.assignee,
services=db_incident.affected_services or [],
rule_fingerprint=db_incident.rule_fingerprint,
fingerprint=db_incident.fingerprint,
same_incident_in_the_past_id=db_incident.same_incident_in_the_past_id,
merged_into_incident_id=db_incident.merged_into_incident_id,
merged_by=db_incident.merged_by,
Expand Down Expand Up @@ -550,6 +551,7 @@ def to_db_incident(self) -> "Incident":
is_predicted=self.is_predicted,
is_confirmed=self.is_confirmed,
rule_fingerprint=self.rule_fingerprint,
fingerprint=self.fingerprint,
same_incident_in_the_past_id=self.same_incident_in_the_past_id,
merged_into_incident_id=self.merged_into_incident_id,
merged_by=self.merged_by,
Expand Down
103 changes: 96 additions & 7 deletions keep/providers/pagerduty_provider/pagerduty_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,24 @@ class PagerdutyProviderAuthConfig:
default="",
)

service_id: str | None = dataclasses.field(
metadata={
"required": False,
"description": "Service Id (if provided, keep will only operate on this service)",
"sensitive": False,
},
default=None,
)
oauth_data: dict = dataclasses.field(
metadata={
"description": "For oauth flow",
"required": False,
"sensitive": True,
"hidden": True,
},
default="",
)


class PagerdutyProvider(BaseTopologyProvider, BaseIncidentProvider):
"""Pull alerts and query incidents from PagerDuty."""
Expand Down Expand Up @@ -505,7 +523,14 @@ def setup_incident_webhook(
"incident.triggered",
"incident.unacknowledged",
],
"filter": {"type": "account_reference"},
"filter": (
{
"type": "service_reference",
"id": self.authentication_config.service_id,
}
if self.authentication_config.service_id
else {"type": "account_reference"}
),
},
}
if webhook_exists:
Expand Down Expand Up @@ -563,6 +588,67 @@ def _notify(
return self._trigger_incident(
service_id, title, alert_body, requester, incident_id
)
incident_alerts = [self._format_alert(alert) for alert in incident_alerts]
incident_dto._alerts = incident_alerts
incidents.append(incident_dto)
return incidents

@staticmethod
def _get_incident_id(incident_id: str) -> str:
"""
Create a UUID from the incident id.

Args:
incident_id (str): The original incident id

Returns:
str: The UUID
"""
md5 = hashlib.md5()
md5.update(incident_id.encode("utf-8"))
return uuid.UUID(md5.hexdigest())

@staticmethod
def _format_incident(
    event: dict, provider_instance: "BaseProvider" = None
) -> IncidentDto | list[IncidentDto]:
    """
    Convert a PagerDuty incident event payload into an IncidentDto.

    Args:
        event (dict): The incoming payload; the incident itself is read
            from event["event"]["data"]. The payload is not modified.
        provider_instance (BaseProvider): Not used by this method.

    Returns:
        IncidentDto: The incident, with its id derived from the PagerDuty
            incident id and the PagerDuty id kept as the fingerprint.

    Raises:
        KeyError: If the payload has no event["event"]["data"] entry.
    """
    event = event["event"]["data"]

    # This will be the same for the same incident
    # NOTE(review): "ping" presumably covers test/ping events that carry no
    # id - confirm against the webhook sender.
    original_incident_id = event.get("id", "ping")

    incident_id = PagerdutyProvider._get_incident_id(original_incident_id)

    # Unknown or missing statuses fall back to FIRING
    status = PagerdutyProvider.INCIDENT_STATUS_MAP.get(
        event.get("status", "firing"), IncidentStatus.FIRING
    )
    # "priority" may be present but null, hence the `or {}`
    priority_summary = (event.get("priority") or {}).get("summary", "P4")
    severity = PagerdutyProvider.INCIDENT_SEVERITIES_MAP.get(
        priority_summary, IncidentSeverity.INFO
    )
    # Read (do not pop) so the caller's payload is left untouched, and
    # tolerate a null "service" the same way "priority" is handled.
    service = (event.get("service") or {}).get("summary", "unknown")
    alerts_count = (event.get("alert_counts") or {}).get("all", 0)

    created_at = event.get("created_at")
    if created_at:
        # datetime.fromisoformat only accepts a trailing "Z" from
        # Python 3.11 on; normalise it to an explicit UTC offset.
        if created_at.endswith("Z"):
            created_at = created_at[:-1] + "+00:00"
        created_at = datetime.datetime.fromisoformat(created_at)
    else:
        created_at = datetime.datetime.now(tz=datetime.timezone.utc)

    return IncidentDto(
        id=incident_id,
        creation_time=created_at,
        user_generated_name=f'PD-{event.get("title", "unknown")}-{original_incident_id}',
        status=status,
        severity=severity,
        alert_sources=["pagerduty"],
        alerts_count=alerts_count,
        services=[service],
        is_predicted=False,
        is_confirmed=True,
        # This is the reference to the incident in PagerDuty
        fingerprint=original_incident_id,
    )

def _query(self, incident_id: str = None):
incidents = self.__get_all_incidents_or_alerts()
Expand Down Expand Up @@ -678,14 +764,17 @@ def __get_all_incidents_or_alerts(self, incident_id: str = None):
url += f"/{incident_id}/alerts"
include = ["teams", "services"]
resource = "alerts"
params = {
"include[]": include,
"offset": offset,
"limit": 100,
}
if not incident_id and self.authentication_config.service_id:
params["service_ids[]"] = [self.authentication_config.service_id]
response = requests.get(
url=url,
headers=self.__get_headers(),
params={
"include[]": include,
"offset": offset,
"limit": 100,
},
params=params,
)
response.raise_for_status()
response = response.json()
Expand All @@ -696,7 +785,7 @@ def __get_all_incidents_or_alerts(self, incident_id: str = None):
paginated_response.extend(response.get(resource, []))
self.logger.info("Fetched incidents or alerts", extra={"offset": offset})
# No more results
if response.get("more", False) == False:
if not response.get("more", False):
self.logger.info("No more incidents or alerts")
break
self.logger.info(
Expand Down
Loading