Skip to content

Commit b68f5cf

Browse files
authored
feat(aci): Track DataCondition errors and propagate such results as tainted (#103003)
DataCondition failures are currently reported to Sentry/logs individually, then treated as non-triggering. Here we return a special error type, so that the caller knows the result isn't necessarily reliable, and in DataConditionGroup logic we aggregate these results in such a way that we know at the end if an untrustworthy result influenced the outcome. These "tainted" results are reported for each Detector evaluation, so we can track how often our results were influenced by errors, giving us a better high-level picture of user impact than we'd have purely by looking at individual reported error counts. This pattern will be extended to cover other DataCondition evaluation cases.
1 parent f4f642d commit b68f5cf

File tree

18 files changed

+475
-125
lines changed

18 files changed

+475
-125
lines changed

src/sentry/grouping/grouptype.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,20 @@
55
from sentry.models.group import DEFAULT_TYPE_ID
66
from sentry.types.group import PriorityLevel
77
from sentry.workflow_engine.endpoints.validators.error_detector import ErrorDetectorValidator
8-
from sentry.workflow_engine.handlers.detector.base import DetectorHandler
9-
from sentry.workflow_engine.models.data_source import DataPacket
10-
from sentry.workflow_engine.types import (
11-
DetectorEvaluationResult,
12-
DetectorGroupKey,
13-
DetectorSettings,
8+
from sentry.workflow_engine.handlers.detector.base import (
9+
DetectorHandler,
10+
GroupedDetectorEvaluationResult,
1411
)
12+
from sentry.workflow_engine.models.data_source import DataPacket
13+
from sentry.workflow_engine.types import DetectorSettings
1514

1615
T = TypeVar("T")
1716

1817

1918
class ErrorDetectorHandler(DetectorHandler):
20-
def evaluate(
21-
self, data_packet: DataPacket[T]
22-
) -> dict[DetectorGroupKey, DetectorEvaluationResult]:
19+
def evaluate_impl(self, data_packet: DataPacket[T]) -> GroupedDetectorEvaluationResult:
2320
# placeholder
24-
return {}
21+
return GroupedDetectorEvaluationResult(result={}, tainted=False)
2522

2623

2724
@dataclass(frozen=True)

src/sentry/workflow_engine/handlers/detector/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,15 @@
44
"DetectorHandler",
55
"DetectorOccurrence",
66
"DetectorStateData",
7+
"GroupedDetectorEvaluationResult",
78
"StatefulDetectorHandler",
89
]
910

10-
from .base import DataPacketEvaluationType, DataPacketType, DetectorHandler, DetectorOccurrence
11+
from .base import (
12+
DataPacketEvaluationType,
13+
DataPacketType,
14+
DetectorHandler,
15+
DetectorOccurrence,
16+
GroupedDetectorEvaluationResult,
17+
)
1118
from .stateful import DetectorStateData, StatefulDetectorHandler

src/sentry/workflow_engine/handlers/detector/base.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from sentry.issues.grouptype import GroupType
1212
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
1313
from sentry.types.actor import Actor
14+
from sentry.utils import metrics
1415
from sentry.workflow_engine.models import DataConditionGroup, DataPacket, Detector
1516
from sentry.workflow_engine.processors.data_condition_group import ProcessedDataConditionGroup
1617
from sentry.workflow_engine.types import (
@@ -77,6 +78,12 @@ def to_issue_occurrence(
7778
)
7879

7980

81+
@dataclass(frozen=True)
82+
class GroupedDetectorEvaluationResult:
83+
result: dict[DetectorGroupKey, DetectorEvaluationResult]
84+
tainted: bool
85+
86+
8087
# TODO - @saponifi3d - Change this class to be a pure ABC and remove the `__init__` method.
8188
# TODO - @saponifi3d - Once the change is made, we should introduce a `BaseDetector` class to evaluate simple cases
8289
class DetectorHandler(abc.ABC, Generic[DataPacketType, DataPacketEvaluationType]):
@@ -102,10 +109,27 @@ def __init__(self, detector: Detector):
102109
else:
103110
self.condition_group = None
104111

105-
@abc.abstractmethod
106112
def evaluate(
107113
self, data_packet: DataPacket[DataPacketType]
108-
) -> dict[DetectorGroupKey, DetectorEvaluationResult] | None:
114+
) -> dict[DetectorGroupKey, DetectorEvaluationResult]:
115+
tags = {
116+
"detector_type": self.detector.type,
117+
"result": "unknown",
118+
}
119+
try:
120+
value = self.evaluate_impl(data_packet)
121+
tags["result"] = "tainted" if value.tainted else "success"
122+
metrics.incr("workflow_engine_detector.evaluation", tags=tags, sample_rate=1.0)
123+
return value.result
124+
except Exception:
125+
tags["result"] = "failure"
126+
metrics.incr("workflow_engine_detector.evaluation", tags=tags, sample_rate=1.0)
127+
raise
128+
129+
@abc.abstractmethod
130+
def evaluate_impl(
131+
self, data_packet: DataPacket[DataPacketType]
132+
) -> GroupedDetectorEvaluationResult:
109133
"""
110134
This method is used to evaluate the data packet's value against the conditions on the detector.
111135
"""

src/sentry/workflow_engine/handlers/detector/stateful.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
DetectorHandler,
2020
DetectorOccurrence,
2121
EventData,
22+
GroupedDetectorEvaluationResult,
2223
)
2324
from sentry.workflow_engine.models import DataPacket, Detector, DetectorState
2425
from sentry.workflow_engine.processors.data_condition_group import (
@@ -371,14 +372,16 @@ def _build_workflow_engine_evidence_data(
371372
],
372373
}
373374

374-
def evaluate(
375+
def evaluate_impl(
375376
self, data_packet: DataPacket[DataPacketType]
376-
) -> dict[DetectorGroupKey, DetectorEvaluationResult]:
377+
) -> GroupedDetectorEvaluationResult:
377378
dedupe_value = self.extract_dedupe_value(data_packet)
378379
group_data_values = self._extract_value_from_packet(data_packet)
379380
state = self.state_manager.get_state_data(list(group_data_values.keys()))
380381
results: dict[DetectorGroupKey, DetectorEvaluationResult] = {}
381382

383+
tainted = False
384+
382385
for group_key, data_value in group_data_values.items():
383386
state_data: DetectorStateData = state[group_key]
384387
if dedupe_value <= state_data.dedupe_value:
@@ -391,7 +394,10 @@ def evaluate(
391394
group_data_values[group_key]
392395
)
393396

394-
if condition_results is None or condition_results.logic_result is False:
397+
if condition_results is not None and condition_results.logic_result.is_tainted():
398+
tainted = True
399+
400+
if condition_results is None or condition_results.logic_result.triggered is False:
395401
# Invalid condition result, nothing we can do
396402
# Or if we didn't match any conditions in the evaluation
397403
continue
@@ -441,7 +447,7 @@ def evaluate(
441447
)
442448

443449
self.state_manager.commit_state_updates()
444-
return results
450+
return GroupedDetectorEvaluationResult(result=results, tainted=tainted)
445451

446452
def _create_resolve_message(
447453
self,
@@ -624,7 +630,7 @@ def _evaluation_detector_conditions(
624630
},
625631
)
626632

627-
if condition_evaluation.logic_result:
633+
if condition_evaluation.logic_result.triggered:
628634
validated_condition_results: list[DetectorPriorityLevel] = [
629635
condition_result.result
630636
for condition_result in condition_evaluation.condition_results

src/sentry/workflow_engine/models/data_condition.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from sentry.db.models import DefaultFieldsModel, region_silo_model, sane_repr
1515
from sentry.utils import metrics, registry
1616
from sentry.workflow_engine.registry import condition_handler_registry
17-
from sentry.workflow_engine.types import DataConditionResult, DetectorPriorityLevel
17+
from sentry.workflow_engine.types import ConditionError, DataConditionResult, DetectorPriorityLevel
1818
from sentry.workflow_engine.utils import scopedstats
1919

2020
logger = logging.getLogger(__name__)
@@ -145,7 +145,7 @@ def get_snapshot(self) -> dict[str, Any]:
145145
"condition_result": self.condition_result,
146146
}
147147

148-
def get_condition_result(self) -> DataConditionResult:
148+
def get_condition_result(self) -> DataConditionResult | ConditionError:
149149
match self.condition_result:
150150
case float() | bool():
151151
return self.condition_result
@@ -159,15 +159,15 @@ def get_condition_result(self) -> DataConditionResult:
159159
"Invalid condition result",
160160
extra={"condition_result": self.condition_result, "id": self.id},
161161
)
162+
return ConditionError(msg="Invalid condition result")
162163

163-
return None
164-
165-
def _evaluate_operator(self, condition_type: Condition, value: T) -> DataConditionResult:
164+
def _evaluate_operator(
165+
self, condition_type: Condition, value: T
166+
) -> DataConditionResult | ConditionError:
166167
# If the condition is a base type, handle it directly
167168
op = CONDITION_OPS[condition_type]
168-
result = None
169169
try:
170-
result = op(cast(Any, value), self.comparison)
170+
return op(cast(Any, value), self.comparison)
171171
except TypeError:
172172
logger.exception(
173173
"Invalid comparison for data condition",
@@ -178,19 +178,20 @@ def _evaluate_operator(self, condition_type: Condition, value: T) -> DataConditi
178178
"condition_id": self.id,
179179
},
180180
)
181-
182-
return result
181+
return ConditionError(msg="Invalid comparison for data condition")
183182

184183
@scopedstats.timer()
185-
def _evaluate_condition(self, condition_type: Condition, value: T) -> DataConditionResult:
184+
def _evaluate_condition(
185+
self, condition_type: Condition, value: T
186+
) -> DataConditionResult | ConditionError:
186187
try:
187188
handler = condition_handler_registry.get(condition_type)
188189
except registry.NoRegistrationExistsError:
189190
logger.exception(
190191
"No registration exists for condition",
191192
extra={"type": self.type, "id": self.id},
192193
)
193-
return None
194+
return ConditionError(msg="No registration exists for condition")
194195

195196
should_be_fast = not is_slow_condition(self)
196197
start_time = time.time()
@@ -212,7 +213,7 @@ def _evaluate_condition(self, condition_type: Condition, value: T) -> DataCondit
212213
"error": str(e),
213214
},
214215
)
215-
return None
216+
return ConditionError(msg=str(e))
216217
finally:
217218
duration = time.time() - start_time
218219
if should_be_fast and duration >= FAST_CONDITION_TOO_SLOW_THRESHOLD.total_seconds():
@@ -230,17 +231,17 @@ def _evaluate_condition(self, condition_type: Condition, value: T) -> DataCondit
230231

231232
return result
232233

233-
def evaluate_value(self, value: T) -> DataConditionResult:
234+
def evaluate_value(self, value: T) -> DataConditionResult | ConditionError:
234235
try:
235236
condition_type = Condition(self.type)
236237
except ValueError:
237238
logger.exception(
238239
"Invalid condition type",
239240
extra={"type": self.type, "id": self.id},
240241
)
241-
return None
242+
return ConditionError(msg="Invalid condition type")
242243

243-
result: DataConditionResult
244+
result: DataConditionResult | ConditionError
244245
if condition_type in CONDITION_OPS:
245246
result = self._evaluate_operator(condition_type, value)
246247
else:

src/sentry/workflow_engine/models/workflow.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
from sentry.models.owner_base import OwnerModel
1919
from sentry.workflow_engine.models.data_condition import DataCondition, is_slow_condition
2020
from sentry.workflow_engine.models.data_condition_group import DataConditionGroup
21-
from sentry.workflow_engine.types import WorkflowEventData
21+
from sentry.workflow_engine.processors.data_condition_group import TriggerResult
22+
from sentry.workflow_engine.types import ConditionError, WorkflowEventData
2223

2324
from .json_config import JSONConfigBase
2425

@@ -93,7 +94,7 @@ def get_audit_log_data(self) -> dict[str, Any]:
9394

9495
def evaluate_trigger_conditions(
9596
self, event_data: WorkflowEventData, when_data_conditions: list[DataCondition] | None = None
96-
) -> tuple[bool, list[DataCondition]]:
97+
) -> tuple[TriggerResult, list[DataCondition]]:
9798
"""
9899
Evaluate the conditions for the workflow trigger and return if the evaluation was successful.
99100
If there aren't any workflow trigger conditions, the workflow is considered triggered.
@@ -104,7 +105,7 @@ def evaluate_trigger_conditions(
104105
)
105106

106107
if self.when_condition_group_id is None:
107-
return True, []
108+
return TriggerResult.TRUE, []
108109

109110
workflow_event_data = replace(event_data, workflow_env=self.environment)
110111
try:
@@ -116,7 +117,7 @@ def evaluate_trigger_conditions(
116117
"DataConditionGroup does not exist",
117118
extra={"id": self.when_condition_group_id},
118119
)
119-
return False, []
120+
return TriggerResult(False, ConditionError(msg="DataConditionGroup does not exist")), []
120121
group_evaluation, remaining_conditions = process_data_condition_group(
121122
group, workflow_event_data, when_data_conditions
122123
)

0 commit comments

Comments
 (0)