Skip to content

Commit f24d8cc

Browse files
feat(data-retention): granular PII redaction stages (input + block outputs)
1 parent c7bb37d commit f24d8cc

21 files changed

Lines changed: 1199 additions & 245 deletions

File tree

apps/pii/server.py

Lines changed: 90 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,13 @@
1010
from typing import Any
1111

1212
from fastapi import FastAPI
13-
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
13+
from presidio_analyzer import (
14+
AnalyzerEngine,
15+
BatchAnalyzerEngine,
16+
Pattern,
17+
PatternRecognizer,
18+
RecognizerResult,
19+
)
1420
from presidio_analyzer.nlp_engine import NlpEngineProvider
1521
from presidio_analyzer.predefined_recognizers import (
1622
AuAbnRecognizer,
@@ -133,6 +139,7 @@ def build_analyzer() -> AnalyzerEngine:
133139

134140

135141
# Engines are constructed once at import time and reused by the request handlers.
analyzer = build_analyzer()
# Batch wrapper around the same analyzer instance; used for multi-text analysis.
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
anonymizer = AnonymizerEngine()
137144

138145
# Propagates to uvicorn's root handler, so timing lands in the container log stream.
@@ -149,13 +156,65 @@ class AnalyzeRequest(BaseModel):
149156
return_decision_process: bool = False
150157

151158

159+
class AnalyzeBatchRequest(BaseModel):
    """Request body for the batched analyze endpoint."""

    # Texts to analyze; results are returned one span list per text, in this order.
    texts: list[str]
    # Language code forwarded to the analyzer.
    language: str = "en"
    # Entity types to detect; None (or an empty list) is forwarded as None.
    entities: list[str] | None = None
    # Score threshold forwarded to the analyzer; None is forwarded unchanged.
    score_threshold: float | None = None
164+
165+
152166
class AnonymizeRequest(BaseModel):
    """Request body for the single-text anonymize endpoint."""

    # Text to mask.
    text: str
    # Detected spans as plain dicts; each needs "entity_type", "start" and "end",
    # and may carry an optional "score".
    analyzer_results: list[dict[str, Any]] = []
    # Per-entity operator settings. When both fields are supplied, a non-empty
    # `anonymizers` takes precedence over `operators`.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None
157171

158172

173+
class AnonymizeBatchItem(BaseModel):
    """One text plus its detected spans inside a batched anonymize request."""

    # Text to mask.
    text: str
    # Detected spans as plain dicts; each needs "entity_type", "start" and "end",
    # and may carry an optional "score".
    analyzer_results: list[dict[str, Any]] = []
176+
177+
178+
class AnonymizeBatchRequest(BaseModel):
    """Request body for the batched anonymize endpoint."""

    # Items to mask; the response lists masked texts in the same order.
    items: list[AnonymizeBatchItem] = []
    # Per-entity operator settings shared by every item. When both fields are
    # supplied, a non-empty `anonymizers` takes precedence over `operators`.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None
182+
183+
184+
def build_operators(
    raw_operators: dict[str, dict[str, Any]] | None,
) -> dict[str, OperatorConfig] | None:
    """Convert a request's raw operator mapping into OperatorConfig objects.

    Args:
        raw_operators: Mapping of entity name to operator settings. Each
            settings dict may carry a "type" key naming the operator; it
            defaults to "replace". The remaining keys are handed to
            OperatorConfig as its parameters.

    Returns:
        One OperatorConfig per entity, or None when ``raw_operators`` is None
        or empty.
    """
    if not raw_operators:
        return None
    operators: dict[str, OperatorConfig] = {}
    for entity, raw_cfg in raw_operators.items():
        # Copy first so popping "type" never mutates the caller's request data.
        op_cfg = dict(raw_cfg)
        op_type = op_cfg.pop("type", "replace")
        operators[entity] = OperatorConfig(op_type, op_cfg)
    return operators
195+
196+
197+
def run_anonymize(
    text: str,
    raw_results: list[dict[str, Any]],
    operators: dict[str, OperatorConfig] | None,
):
    """Mask ``text`` using spans supplied as plain dicts.

    Args:
        text: The text to mask.
        raw_results: Detected spans. Each dict must contain "entity_type",
            "start" and "end"; "score" is optional.
        operators: Per-entity operator configs, or None to pass no overrides
            to the anonymizer.

    Returns:
        Whatever the module-level ``anonymizer.anonymize`` call returns.

    Raises:
        KeyError: If a span dict lacks "entity_type", "start" or "end".
    """
    analyzer_results = [
        RecognizerResult(
            entity_type=r["entity_type"],
            start=r["start"],
            end=r["end"],
            # A span without a score is given the maximum score of 1.0.
            score=r.get("score", 1.0),
        )
        for r in raw_results
    ]
    return anonymizer.anonymize(
        text=text,
        analyzer_results=analyzer_results,
        operators=operators,
    )
216+
217+
159218
@app.get("/health")
def health() -> dict[str, str]:
    """Report a constant OK status."""
    return {"status": "ok"}
@@ -186,35 +245,28 @@ def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]:
186245
return [r.to_dict() for r in results]
187246

188247

248+
@app.post("/analyze_batch")
def analyze_batch(req: AnalyzeBatchRequest) -> list[list[dict[str, Any]]]:
    """Analyze many texts in one pass (spaCy nlp.pipe), returning one span list
    per input in request order — the batched counterpart to /analyze.

    An empty ``texts`` list yields an empty response without calling the
    analyzer.
    """
    if not req.texts:
        # Nothing to analyze; skip the engine call entirely.
        return []
    results = batch_analyzer.analyze_iterator(
        texts=req.texts,
        language=req.language,
        # An empty entity list is forwarded as None rather than as [].
        entities=req.entities or None,
        score_threshold=req.score_threshold,
    )
    return [[r.to_dict() for r in per_text] for per_text in results]
259+
260+
189261
@app.post("/anonymize")
190262
def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
191263
started = time.perf_counter()
192-
analyzer_results = [
193-
RecognizerResult(
194-
entity_type=r["entity_type"],
195-
start=r["start"],
196-
end=r["end"],
197-
score=r.get("score", 1.0),
198-
)
199-
for r in req.analyzer_results
200-
]
201-
raw_operators = req.anonymizers or req.operators
202-
operators = None
203-
if raw_operators:
204-
operators = {}
205-
for entity, raw_cfg in raw_operators.items():
206-
op_cfg = dict(raw_cfg)
207-
op_type = op_cfg.pop("type", "replace")
208-
operators[entity] = OperatorConfig(op_type, op_cfg)
209-
result = anonymizer.anonymize(
210-
text=req.text,
211-
analyzer_results=analyzer_results,
212-
operators=operators,
213-
)
264+
operators = build_operators(req.anonymizers or req.operators)
265+
result = run_anonymize(req.text, req.analyzer_results, operators)
214266
logger.info(
215267
"anonymize chars=%d spans=%d duration_ms=%.1f",
216268
len(req.text),
217-
len(analyzer_results),
269+
len(req.analyzer_results),
218270
(time.perf_counter() - started) * 1000,
219271
)
220272
return {
@@ -230,3 +282,17 @@ def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
230282
for item in result.items
231283
],
232284
}
285+
286+
287+
@app.post("/anonymize_batch")
def anonymize_batch(req: AnonymizeBatchRequest) -> dict[str, list[str]]:
    """Mask many texts in one pass, returning masked text per item in request
    order — the batched counterpart to /anonymize. Anonymization is pure string
    work (no NLP), so callers should send only items with detected spans.

    The operator configuration is built once and shared by every item. A
    timing line is logged per request, mirroring the /anonymize handler.
    """
    started = time.perf_counter()
    operators = build_operators(req.anonymizers or req.operators)
    masked = [
        run_anonymize(item.text, item.analyzer_results, operators).text
        for item in req.items
    ]
    # Log sizes and duration only; never the text itself.
    logger.info(
        "anonymize_batch items=%d chars=%d spans=%d duration_ms=%.1f",
        len(req.items),
        sum(len(item.text) for item in req.items),
        sum(len(item.analyzer_results) for item in req.items),
        (time.perf_counter() - started) * 1000,
    )
    return {"texts": masked}

apps/sim/app/api/organizations/[id]/data-retention/route.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,22 @@ function normalizeConfigured(
4242
rules: settings.piiRedaction.rules.map((rule) => ({
4343
...rule,
4444
language: coercePiiLanguage(rule.language),
45+
stages: rule.stages
46+
? {
47+
input: {
48+
...rule.stages.input,
49+
language: coercePiiLanguage(rule.stages.input.language),
50+
},
51+
blockOutputs: {
52+
...rule.stages.blockOutputs,
53+
language: coercePiiLanguage(rule.stages.blockOutputs.language),
54+
},
55+
logs: {
56+
...rule.stages.logs,
57+
language: coercePiiLanguage(rule.stages.logs.language),
58+
},
59+
}
60+
: undefined,
4561
})),
4662
}
4763
: null,

0 commit comments

Comments
 (0)