Skip to content

Commit

Permalink
incident name generation added
Browse files Browse the repository at this point in the history
  • Loading branch information
GlebBerjoskin committed Sep 1, 2024
1 parent d47d733 commit 84fe143
Show file tree
Hide file tree
Showing 7 changed files with 254 additions and 51 deletions.
16 changes: 8 additions & 8 deletions ee/experimental/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,17 +91,17 @@ def create_graph(tenant_id: str, fingerprints: List[str], temp_dir: str, pmi_thr
if weight > pmi_threshold:
graph.add_edge(fingerprint_i, fingerprint_j, weight=weight)

nodes_to_delete = []
logger.info(f'Preparing candidate nodes for deletion', extra={'tenant_id': tenant_id})
# nodes_to_delete = []
# logger.info(f'Preparing candidate nodes for deletion', extra={'tenant_id': tenant_id})

for node in graph.nodes:
weights = sorted([edge['weight'] for edge in graph[node].values()])
# for node in graph.nodes:
# weights = sorted([edge['weight'] for edge in graph[node].values()])

knee_index, knee_statistic = detect_knee_1d_auto_increasing(weights)
# knee_index, knee_statistic = detect_knee_1d_auto_increasing(weights)

if knee_statistic < knee_threshold:
nodes_to_delete.append(node)
# if knee_statistic < knee_threshold:
# nodes_to_delete.append(node)

graph.remove_nodes_from(nodes_to_delete)
# graph.remove_nodes_from(nodes_to_delete)

return graph
159 changes: 143 additions & 16 deletions ee/experimental/incident_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,25 @@
write_pmi_matrix_to_temp_file,
create_incident_from_dict,
update_incident_summary,
update_incident_name,
)

from keep.api.core.dependencies import (
AuthenticatedEntity,
AuthVerifier,
get_pusher_client,
)

logger = logging.getLogger(__name__)

ALGORITHM_VERBOSE_NAME = "Correlation algorithm v0.2"
SUMMARY_GENERATOR_VERBOSE_NAME = "Summary generator v0.1"
NAME_GENERATOR_VERBOSE_NAME = "Name generator v0.1"
USE_N_HISTORICAL_ALERTS_MINING = 10e4
USE_N_HISTORICAL_ALERTS_PMI = 10e4
USE_N_HISTORICAL_INCIDENTS = 10e4
MIN_ALERT_NUMBER = 100
DEFAULT_TEMP_DIR_LOCATION = './ee/experimental/ai_temp'
MAX_SUMMARY_LENGTH = 900
MAX_NAME_LENGTH = 75


def calculate_pmi_matrix(
Expand Down Expand Up @@ -73,7 +74,7 @@ def calculate_pmi_matrix(
sliding_window = os.environ.get('PMI_SLIDING_WINDOW', 4 * 60 * 60)

if not stride:
stride = os.environ.get('PMI_STRIDE', 60 * 60)
stride = os.environ.get('PMI_STRIDE', int(sliding_window // 4))

if not temp_dir:
temp_dir = os.environ.get('AI_TEMP_FOLDER', DEFAULT_TEMP_DIR_LOCATION)
Expand Down Expand Up @@ -205,7 +206,7 @@ async def mine_incidents_and_create_objects(
temp_dir = f'{general_temp_dir}/{tenant_id}'
os.makedirs(temp_dir, exist_ok=True)

status = calculate_pmi_matrix(ctx, tenant_id)
status = calculate_pmi_matrix(ctx, tenant_id, min_alert_number=min_alert_number)
if status.get('status') == 'failed':
return {"incidents": []}

Expand Down Expand Up @@ -239,7 +240,7 @@ async def mine_incidents_and_create_objects(
},
)

incident_ids_for_summary_generation = []
incident_ids_to_update = []

for component in nx.connected_components(graph):
if len(component) > min_incident_size:
Expand All @@ -254,7 +255,7 @@ async def mine_incidents_and_create_objects(

add_alerts_to_incident_by_incident_id(tenant_id, incident.id, [
alert.id for alert in alerts if alert.fingerprint in component])
incident_ids_for_summary_generation.append(incident.id)
incident_ids_to_update.append(incident.id)

if not alerts_appended:
incident_start_time = min(
Expand All @@ -269,21 +270,32 @@ async def mine_incidents_and_create_objects(

add_alerts_to_incident_by_incident_id(tenant_id, incident.id, [
alert.id for alert in alerts if alert.fingerprint in component])
incident_ids_for_summary_generation.append(incident.id)
incident_ids_to_update.append(incident.id)

if not ctx:
pool = get_pool()
else:
pool = ctx["redis"]
for incident_id in incident_ids_for_summary_generation:
job = await pool.enqueue_job(
for incident_id in incident_ids_to_update:
job_summary = await pool.enqueue_job(
"process_summary_generation",
tenant_id=tenant_id,
incident_id=incident_id,
)
logger.info(
f"Summary generation for incident {incident_id} scheduled, job: {job}",
extra={"algorithm": ALGORITHM_VERBOSE_NAME,
f"Summary generation for incident {incident_id} scheduled, job: {job_summary}",
extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME,
"tenant_id": tenant_id, "incident_id": incident_id},
)

job_name = await pool.enqueue_job(
"process_name_generation",
tenant_id=tenant_id,
incident_id=incident_id,
)
logger.info(
f"Name generation for incident {incident_id} scheduled, job: {job_name}",
extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME,
"tenant_id": tenant_id, "incident_id": incident_id},
)

Expand Down Expand Up @@ -447,13 +459,17 @@ def shape_incidents(alerts: pd.DataFrame, unique_alert_identifier: str, incident
def generate_incident_summary(incident: Incident, use_n_alerts_for_summary: int = -1, generate_summary: str = None, max_summary_length: int = None) -> str:
if "OPENAI_API_KEY" not in os.environ:
logger.error(
"OpenAI API key is not set. Incident summary generation is not available.")
"OpenAI API key is not set. Incident summary generation is not available.",
extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}
)
return ""

if not generate_summary:
generate_summary = os.environ.get("GENERATE_INCIDENT_SUMMARY", "True")

if generate_summary == "False":
logger.info(f"Incident summary generation is disabled. Aborting.",
extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})
return ""

if incident.user_summary:
Expand Down Expand Up @@ -511,11 +527,11 @@ def generate_incident_summary(incident: Incident, use_n_alerts_for_summary: int
]).choices[0].message.content

logger.info(f"Generated incident summary with length {len(summary)} symbols",
extra={"incident_id": incident.id, "tenant_id": incident.tenant_id})
extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})

if len(summary) > max_summary_length:
logger.info(f"Generated incident summary is too long. Applying smart truncation",
extra={"incident_id": incident.id, "tenant_id": incident.tenant_id})
extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})

summary = client.chat.completions.create(model=model, messages=[
{
Expand All @@ -532,17 +548,120 @@ def generate_incident_summary(incident: Incident, use_n_alerts_for_summary: int
]).choices[0].message.content

logger.info(f"Generated new incident summary with length {len(summary)} symbols",
extra={"incident_id": incident.id, "tenant_id": incident.tenant_id})
extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})

if len(summary) > max_summary_length:
logger.info(f"Generated incident summary is too long. Applying hard truncation",
extra={"incident_id": incident.id, "tenant_id": incident.tenant_id})
extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})
summary = summary[: max_summary_length]

return summary
except Exception as e:
logger.error(f"Error in generating incident summary: {e}")
return ""


def generate_incident_name(incident: Incident, generate_name: str = None, max_name_length: int = None, use_n_alerts_for_name: int = -1) -> str:
    """Generate a short, human-readable name for an incident via OpenAI.

    Parameters:
        incident: the incident to name (re-fetched by id before use so the
            related alerts are loaded).
        generate_name: "False" disables generation; defaults to the
            GENERATE_INCIDENT_NAME env var ("True" if unset).
        max_name_length: hard cap on the returned name length; defaults to
            the MAX_NAME_LENGTH env var, falling back to the module constant.
        use_n_alerts_for_name: if > 0, only the first N unique alert
            descriptions are sent to the model.

    Returns:
        The generated name, or "" when generation is disabled, the API key
        is missing, or any error occurs (errors are logged, never raised).
    """
    if "OPENAI_API_KEY" not in os.environ:
        logger.error(
            "OpenAI API key is not set. Incident name generation is not available.",
            extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}
        )
        return ""

    if not generate_name:
        generate_name = os.environ.get("GENERATE_INCIDENT_NAME", "True")

    if generate_name == "False":
        logger.info("Incident name generation is disabled. Aborting.",
                    extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})
        return ""

    if not max_name_length:
        # Environment variables are strings: cast to int, otherwise the
        # arithmetic (max_name_length * 0.9) and the length comparisons
        # below raise TypeError whenever MAX_NAME_LENGTH is set in the env.
        max_name_length = int(os.environ.get("MAX_NAME_LENGTH", MAX_NAME_LENGTH))

    try:
        client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

        # Re-fetch by id so incident.alerts is populated.
        incident = get_incident_by_id(incident.tenant_id, incident.id)

        description_strings = np.unique(
            [f'{alert.event["name"]}' for alert in incident.alerts]).tolist()

        if use_n_alerts_for_name > 0:
            incident_description = "\n".join(
                description_strings[:use_n_alerts_for_name])
        else:
            incident_description = "\n".join(description_strings)

        timestamps = [alert.timestamp for alert in incident.alerts]
        incident_start = min(timestamps).replace(microsecond=0)

        model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

        name = client.chat.completions.create(model=model, messages=[
            {
                "role": "system",
                "content": f"""You are a very skilled DevOps specialist who can name any incident based on alert descriptions.
                When provided with information, output a short descriptive name of incident that could cause these alerts.
                Add information about start time to the name. ONLY USE WHAT YOU SEE. Answer with max a {int(max_name_length * 0.9)}
                symbols excerpt.

                EXAMPLE:
                Kubernetes rollout stuck (started on 2022.11.17 14:11)"""
            },
            {
                "role": "user",
                "content": f"""Here are alerts of an incident:\n{incident_description}\n This incident started on
                {incident_start}"""
            }
        ]).choices[0].message.content

        logger.info(f"Generated incident name with length {len(name)} symbols",
                    extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})

        if len(name) > max_name_length:
            # First fallback: ask the model to compress its own output.
            logger.info(f"Generated incident name is too long. Applying smart truncation",
                        extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})

            name = client.chat.completions.create(model=model, messages=[
                {
                    "role": "system",
                    "content": f"""You are a very skilled DevOps specialist who can name any incident based on a description.
                    Add information about start time to the name.When provided with information, answer with max a
                    {int(max_name_length * 0.9)} symbols.

                    EXAMPLE:
                    Kubernetes rollout stuck (started on 2022.11.17 14:11)"""
                },
                {
                    "role": "user",
                    "content": f"""Here is the description of an incident to name:\n{name}.
                    This incident started on {incident_start}"""
                }
            ]).choices[0].message.content

            logger.info(f"Generated new incident name with length {len(name)} symbols",
                        extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})

            if len(name) > max_name_length:
                # Last resort: cut at the hard limit.
                logger.info(f"Generated incident name is too long. Applying hard truncation",
                            extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id})
                name = name[: max_name_length]

        return name
    except Exception as e:
        # Best-effort: naming must never break the correlation pipeline.
        logger.error(f"Error in generating incident name: {e}")
        return ""


async def generate_update_incident_summary(ctx, tenant_id: str, incident_id: str):
Expand All @@ -551,3 +670,11 @@ async def generate_update_incident_summary(ctx, tenant_id: str, incident_id: str
update_incident_summary(tenant_id, incident_id, summary)

return summary


async def generate_update_incident_name(ctx, tenant_id: str, incident_id: str):
    """Generate a name for the given incident, persist it, and return it.

    Thin orchestration wrapper used by the background worker: fetch the
    incident, run the OpenAI-backed name generator, store the result.
    """
    generated_name = generate_incident_name(
        get_incident_by_id(tenant_id, incident_id))
    update_incident_name(tenant_id, incident_id, generated_name)
    return generated_name
1 change: 1 addition & 0 deletions keep/api/arq_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"keep.api.tasks.process_background_ai_task.process_background_ai_task",
"keep.api.tasks.process_background_ai_task.process_correlation",
"keep.api.tasks.process_background_ai_task.process_summary_generation",
"keep.api.tasks.process_background_ai_task.process_name_generation",
]

ARQ_BACKGROUND_FUNCTIONS: Optional[CommaSeparatedStrings] = config(
Expand Down
15 changes: 15 additions & 0 deletions keep/api/core/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2573,6 +2573,21 @@ def update_incident_summary(tenant_id: str, incident_id: UUID, summary: str) ->
session.refresh(incident)

return

def update_incident_name(tenant_id: str, incident_id: UUID, name: str) -> Incident:
    """Persist a generated name on an incident.

    Parameters:
        tenant_id: tenant owning the incident.
        incident_id: id of the incident to update.
        name: the generated name; empty/None is a no-op.

    Returns:
        The refreshed Incident, or None when name is empty or the incident
        no longer exists.
    """
    if not name:
        # Nothing to store (generation disabled or failed upstream).
        return None

    with Session(engine) as session:
        incident = session.exec(
            select(Incident)
            .where(Incident.tenant_id == tenant_id)
            .where(Incident.id == incident_id)
        ).first()

        if incident is None:
            # The incident may have been deleted while the background
            # name-generation job was queued; avoid AttributeError.
            return None

        incident.name = name
        session.commit()
        session.refresh(incident)

        return incident


# Fetch all topology data
Expand Down
24 changes: 23 additions & 1 deletion keep/api/tasks/process_background_ai_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from keep.api.core.tenant_configuration import TenantConfiguration
from keep.api.utils.import_ee import mine_incidents_and_create_objects, ALGORITHM_VERBOSE_NAME, \
SUMMARY_GENERATOR_VERBOSE_NAME, is_ee_enabled_for_tenant, generate_update_incident_summary
SUMMARY_GENERATOR_VERBOSE_NAME, NAME_GENERATOR_VERBOSE_NAME, is_ee_enabled_for_tenant, generate_update_incident_summary, generate_update_incident_name
from keep.api.core.db import get_tenants_configurations

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -52,6 +52,28 @@ async def process_summary_generation(ctx, tenant_id: str, incident_id:str):
},
)

async def process_name_generation(ctx, tenant_id: str, incident_id: str):
    """ARQ background task: generate and persist a name for one incident,
    logging start, finish, and wall-clock duration."""
    logger.info(
        f"Background name generation started, {NAME_GENERATOR_VERBOSE_NAME}",
        extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident_id},
    )

    started_at = datetime.datetime.now()
    await generate_update_incident_name(
        ctx,
        tenant_id=tenant_id,
        incident_id=incident_id
    )
    # Compute the elapsed time once and reuse it for both the message
    # and the structured duration_ms field.
    elapsed_seconds = (datetime.datetime.now() - started_at).total_seconds()

    logger.info(
        f"Background name generation finished, {NAME_GENERATOR_VERBOSE_NAME}, took {elapsed_seconds} seconds",
        extra={
            "algorithm": NAME_GENERATOR_VERBOSE_NAME,
            "incident_id": incident_id,
            "duration_ms": elapsed_seconds * 1000
        },
    )


async def process_background_ai_task(
ctx: dict | None, # arq context
Expand Down
8 changes: 5 additions & 3 deletions keep/api/utils/import_ee.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@
)
sys.path.insert(0, path_with_ee)

from ee.experimental.incident_utils import mine_incidents_and_create_objects, generate_update_incident_summary # noqa
from ee.experimental.incident_utils import ALGORITHM_VERBOSE_NAME, SUMMARY_GENERATOR_VERBOSE_NAME # noqa
from ee.experimental.incident_utils import mine_incidents_and_create_objects, generate_update_incident_summary, generate_update_incident_name # noqa
from ee.experimental.incident_utils import ALGORITHM_VERBOSE_NAME, SUMMARY_GENERATOR_VERBOSE_NAME, NAME_GENERATOR_VERBOSE_NAME # noqa
else:
mine_incidents_and_create_objects = NotImplemented
generate_update_incident_summary = NotImplemented
generate_update_incident_name = NotImplemented
ALGORITHM_VERBOSE_NAME = NotImplemented
SUMMARY_GENERATOR_VERBOSE_NAME = NotImplemented

NAME_GENERATOR_VERBOSE_NAME = NotImplemented

def is_ee_enabled_for_tenant(tenant_id: str, tenant_configuration=None) -> bool:
if not EE_ENABLED:
return False
Expand Down
Loading

0 comments on commit 84fe143

Please sign in to comment.