wandb · gtarpenning · Jun 12, 2026
@@ -6902,8 +6902,11 @@ def agent_conversation_chat(
     def genai_otel_export(self, req: GenAIOTelExportReq) -> GenAIOTelExportRes:
         res, span_rows = AgentWriteHandler(self.ch_client).insert_otel_spans(req)
 
-        # Return early without emitting kafka events if online eval or agent scoring are disabled
-        if not wf_env.wf_enable_online_eval() or not wf_env.wf_enable_agent_scoring():
+        # Skip the kafka emit unless online eval is on and at least one consumer
+        # (agent scoring or spans alerting) wants the turn-ended events.
+        if not wf_env.wf_enable_online_eval() or not (
+            wf_env.wf_enable_agent_scoring() or wf_env.wf_enable_agent_alerting()
+        ):
             return res
 
         # Emit for each row that produces a valid event type

@@ -66,6 +66,15 @@ def wf_enable_agent_scoring() -> bool:
     return os.environ.get("WEAVE_ENABLE_AGENT_SCORING", "false").lower() == "true"
 
 
+def wf_enable_agent_alerting() -> bool:
+    """Whether the alert worker consumes turn-ended events for spans alerts.
+
+    Gates the same OTel-ingest emit as agent scoring so spans alerting works
+    even when scoring is off. Also requires `wf_enable_online_eval`.
+    """
+    return os.environ.get("WEAVE_ENABLE_AGENT_ALERTING", "false").lower() == "true"
+
+
 def wf_scoring_worker_batch_size() -> int:
     """The batch size for the scoring worker."""
     return int(

@@ -19,6 +19,10 @@ class WeaveMetricThresholdSpec(BaseModel):
         default="THRESHOLD",
         description="Trigger condition type (e.g. 'THRESHOLD')",
     )
+    source_type: Literal["calls", "spans"] = Field(
+        default="calls",
+        description="Data source this alert evaluates: calls table or agent spans",
+    )
     comparison_operator: Literal["GREATER_THAN", "LESS_THAN", "EQUAL"] = Field(
         default="GREATER_THAN",
         description="Direction of the threshold comparison",
@@ -53,10 +57,16 @@ class WeaveMetricThresholdSpec(BaseModel):
         "Disambiguates scorers that share the same op class (e.g. multiple LLMAsAJudgeScorer instances). "
         "Filterable via inputs.self on scorer calls or input_refs on CallsFilter.",
     )
-    aggregation_function: Literal["mean", "median", "min", "max", "mode"] = Field(
+    aggregation_function: Literal[
+        "mean", "median", "min", "max", "mode", "percentage", "count"
+    ] = Field(
         default="mean",
         description="Aggregation function applied to metric values within the window before threshold comparison",
     )
+    status_match_value: str | None = Field(
+        default=None,
+        description="For percentage/count aggregation: the status value to match (e.g. '2' for error)",
+    )
 
 
 class AlertSpec(base_object_def.BaseObject):