@@ -169,20 +169,27 @@ def get_labels_for_instance(self, instance: str) -> dict:
 				return alert["labels"]
 		return {}
 
-	def past_alert_instances(self, status: DF.Literal["Firing", "Resolved"]) -> set[str]:
+	def past_alert_instances(
+		self,
+		status: DF.Literal["Firing", "Resolved"],
+		since: str | None = None,
+	) -> set[str]:
+		filters = {
+			"alert": self.alert,
+			"severity": self.severity,
+			"status": status,
+			"group_key": ("like", f"%{self.incident_scope}%"),
+			"modified": [
+				">",
+				add_to_date(frappe.utils.now(), hours=-self.get_repeat_interval()),
+			],
+		}
+		if since:
+			filters["creation"] = [">", since]
 		past_alerts = frappe.get_all(
 			self.doctype,
 			fields=["payload"],
-			filters={
-				"alert": self.alert,
-				"severity": self.severity,
-				"status": status,
-				"group_key": ("like", f"%{self.incident_scope}%"),
-				"modified": [
-					">",
-					add_to_date(frappe.utils.now(), hours=-self.get_repeat_interval()),
-				],
-			},
+			filters=filters,
 			group_by="group_key",
 			ignore_ifnull=True,
 		) # get site down alerts grouped by benches
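
For orientation, a minimal sketch of how the new `since` parameter is meant to be used (a hypothetical call site, not part of the diff; `incident` is assumed to be an existing Incident document):

```python
import frappe

# Hypothetical call site; `incident` is assumed to exist.
log = frappe.get_last_doc("Alertmanager Webhook Log")

# Without `since`, any instance modified within the repeat interval counts.
all_recent = log.past_alert_instances("Resolved")

# With `since`, instances created before the incident are excluded, so a
# recovery that predates the incident cannot satisfy a newer one.
fresh_only = log.past_alert_instances("Resolved", since=incident.creation)
```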
@@ -199,14 +206,14 @@ def total_instances(self) -> int:
 			{"status": "Active", INCIDENT_SCOPE: self.incident_scope},
 		)
 
-	@property
-	def is_enough_firing(self):
+	def is_enough_firing(self, since: str | None = None):
 		if self.status == "Resolved":
 			firing_instances = len(
-				self.past_alert_instances("Firing") - self.past_alert_instances("Resolved")
+				self.past_alert_instances("Firing", since=since)
+				- self.past_alert_instances("Resolved", since=since)
 			)
 		else:
-			firing_instances = len(self.past_alert_instances("Firing"))
+			firing_instances = len(self.past_alert_instances("Firing", since=since))
 
 		return firing_instances > min(
 			math.floor(MIN_FIRING_INSTANCES_FRACTION * self.total_instances), MIN_FIRING_INSTANCES
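
The return statement caps the fraction-based requirement at an absolute count. A worked example with assumed constants (the real values are defined elsewhere in this module and may differ):

```python
import math

# Assumed values for illustration only.
MIN_FIRING_INSTANCES_FRACTION = 0.4
MIN_FIRING_INSTANCES = 15
total_instances = 100

threshold = min(
	math.floor(MIN_FIRING_INSTANCES_FRACTION * total_instances),
	MIN_FIRING_INSTANCES,
)
# threshold == 15: more than 15 firing instances are needed, even though
# 40% of 100 would be 40; the cap keeps large fleets from requiring an
# unreasonably high absolute number of instances to be down.
assert threshold == 15
```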
@@ -221,7 +228,7 @@ def validate_and_create_incident(self):
 		rule: PrometheusAlertRule = frappe.get_doc("Prometheus Alert Rule", self.alert)
 		if find(rule.ignore_on_clusters, lambda x: x.cluster == cluster):
 			return
-		if self.is_enough_firing:
+		if self.is_enough_firing():
 			self.create_incident()
 
 	def get_repeat_interval(self):
press/press/doctype/incident/incident.py (3 changes: 2 additions & 1 deletion)
@@ -789,12 +789,13 @@ def check_resolved(self):
 					"status": "Resolved",
 					"group_key": ("like", f"%{self.incident_scope}%"),
 					"alert": self.alert,
+					"creation": (">", self.creation),
 				},
 			)
 		except frappe.DoesNotExistError:
 			return
 		else:
-			if not last_resolved.is_enough_firing:
+			if not last_resolved.is_enough_firing(since=self.creation):
 				self.create_log_for_server(is_resolved=True)
 				self.resolve()
 
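An illustrative timeline of the bug this hunk fixes (timestamps are hypothetical):

```python
# 10:00  "Firing" log    -> first incident created
# 10:30  "Resolved" log  -> first incident auto-resolves
# 11:00  "Firing" log    -> second incident created
#
# Before this change, check_resolved() on the second incident could match
# the 10:30 "Resolved" log, which still falls inside the repeat-interval
# window, and auto-resolve the new incident immediately. The added
# creation > self.creation filter and since=self.creation restrict both
# the lookup and the firing/resolved counts to logs created after the
# current incident began.
```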
press/press/doctype/incident/test_incident.py (38 changes: 35 additions & 3 deletions)
@@ -370,6 +370,38 @@ def test_incident_gets_auto_resolved_when_resolved_alerts_fire(self):
 		incident.reload()
 		self.assertEqual(incident.status, "Auto-Resolved")
 
+	def test_subsequent_incident_not_resolved_by_previous_resolved_alerts(self):
+		"""When a server goes down and recovers, subsequent incidents should not
+		auto-resolve due to resolved alerts from the previous recovery."""
+		site = create_test_site()
+		alert = create_test_prometheus_alert_rule()
+
+		# First incident: server goes down and recovers
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="firing")
+		first_incident: Incident = frappe.get_last_doc("Incident")
+		self.assertEqual(first_incident.status, "Validating")
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="resolved")
+		resolve_incidents()
+		first_incident.reload()
+		self.assertEqual(first_incident.status, "Auto-Resolved")
+
+		# Second incident: server goes down again
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="firing")
+		second_incident: Incident = frappe.get_last_doc("Incident")
+		self.assertNotEqual(first_incident.name, second_incident.name)
+		self.assertEqual(second_incident.status, "Validating")
+
+		# Resolve should NOT auto-resolve the second incident using old resolved alerts
+		resolve_incidents()
+		second_incident.reload()
+		self.assertEqual(second_incident.status, "Validating")
+
+		# Only new resolved alerts that arrive after the second incident may resolve it
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="resolved")
+		resolve_incidents()
+		second_incident.reload()
+		self.assertEqual(second_incident.status, "Auto-Resolved")
+
 	@given(get_total_and_firing_for_ongoing_incident())
 	@settings(max_examples=20, deadline=timedelta(seconds=5))
 	def test_is_enough_firing_is_true_for_ongoing_incident(self, total_firing):
@@ -381,10 +413,10 @@ def test_is_enough_firing_is_true_for_ongoing_incident(self, total_firing):
 			patch.object(
 				AlertmanagerWebhookLog,
 				"past_alert_instances",
-				new=lambda x, y: firing_instances,
+				new=lambda x, y, since=None: firing_instances,
 			),
 		):
-			self.assertTrue(alert.is_enough_firing)
+			self.assertTrue(alert.is_enough_firing())
 
 	@given(get_total_firing_and_resolved_for_resolved_incident())
 	@settings(max_examples=20, deadline=timedelta(seconds=5))
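
The patched lambda gains a `since=None` parameter so its signature stays compatible with the new method, which is called as a bound method with `status` and an optional `since`. A standalone sketch of the same patching pattern (the class and return values are hypothetical):

```python
from unittest.mock import patch


class Probe:
	def past_alert_instances(self, status, since=None):
		return set()


# The replacement accepts (self, status, since=None), so calls that pass
# the new keyword argument still bind cleanly against the patched attribute.
with patch.object(Probe, "past_alert_instances", new=lambda x, y, since=None: {"a", "b"}):
	assert Probe().past_alert_instances("Firing", since="2024-01-01") == {"a", "b"}
```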
@@ -402,7 +434,7 @@ def test_is_enough_firing_is_false_for_resolved_incident(self, total_firing_reso
 				side_effect=[firing_instances, resolved_instances],
 			),
 		):
-			self.assertFalse(alert.is_enough_firing)
+			self.assertFalse(alert.is_enough_firing())
 
 	def test_incident_does_not_resolve_when_other_alerts_are_still_firing_but_does_when_less_than_required_sites_are_down(
 		self,