1010from typing import Any
1111
1212from fastapi import FastAPI
13- from presidio_analyzer import AnalyzerEngine , Pattern , PatternRecognizer , RecognizerResult
13+ from presidio_analyzer import (
14+ AnalyzerEngine ,
15+ BatchAnalyzerEngine ,
16+ Pattern ,
17+ PatternRecognizer ,
18+ RecognizerResult ,
19+ )
1420from presidio_analyzer .nlp_engine import NlpEngineProvider
1521from presidio_analyzer .predefined_recognizers import (
1622 AuAbnRecognizer ,
@@ -133,6 +139,7 @@ def build_analyzer() -> AnalyzerEngine:
133139
134140
# Module-level engine instances, created once at import time and reused by
# the request handlers below.
analyzer = build_analyzer()
# The batch engine wraps the analyzer built above instead of constructing a
# second one, so it must be created after `analyzer`.
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
anonymizer = AnonymizerEngine()
137144
138145# Propagates to uvicorn's root handler, so timing lands in the container log stream.
@@ -149,13 +156,65 @@ class AnalyzeRequest(BaseModel):
149156 return_decision_process : bool = False
150157
151158
class AnalyzeBatchRequest(BaseModel):
    """Request body for ``POST /analyze_batch``."""

    # Texts to analyze; the response carries one result list per entry.
    texts: list[str]
    # Language code forwarded to the batch analyzer.
    language: str = "en"
    # Optional entity filter; an omitted or empty list is forwarded as None.
    entities: list[str] | None = None
    # Optional score threshold, forwarded to the batch analyzer unchanged.
    score_threshold: float | None = None
164+
165+
class AnonymizeRequest(BaseModel):
    """Request body for ``POST /anonymize``."""

    # Text to mask.
    text: str
    # Detected spans as plain dicts; each is read for "entity_type", "start",
    # "end" and an optional "score".
    analyzer_results: list[dict[str, Any]] = []
    # Per-entity operator settings. Both spellings are accepted; the handler
    # uses `anonymizers` when it is non-empty and falls back to `operators`.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None
157171
158172
class AnonymizeBatchItem(BaseModel):
    """One text plus its detected spans inside an ``/anonymize_batch`` request."""

    # Text to mask.
    text: str
    # Detected spans for this text, in the same dict shape /anonymize accepts.
    analyzer_results: list[dict[str, Any]] = []
176+
177+
class AnonymizeBatchRequest(BaseModel):
    """Request body for ``POST /anonymize_batch``."""

    # Texts to mask; the response lists masked text in the same order.
    items: list[AnonymizeBatchItem] = []
    # Per-entity operator settings applied to every item. `anonymizers` is
    # used when non-empty, otherwise `operators`.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None
182+
183+
def build_operators(
    raw_operators: dict[str, dict[str, Any]] | None,
) -> dict[str, OperatorConfig] | None:
    """Convert the request's raw operator mapping into ``OperatorConfig`` objects.

    Each value's ``"type"`` key names the operator (``"replace"`` when the key
    is absent); every other key is passed through as that operator's params.
    The input dicts are not mutated.

    Returns ``None`` when ``raw_operators`` is ``None`` or empty.
    """
    if not raw_operators:
        return None
    return {
        entity: OperatorConfig(
            raw_cfg.get("type", "replace"),
            # Everything except the operator name becomes its params.
            {key: value for key, value in raw_cfg.items() if key != "type"},
        )
        for entity, raw_cfg in raw_operators.items()
    }
195+
196+
def run_anonymize(
    text: str,
    raw_results: list[dict[str, Any]],
    operators: dict[str, OperatorConfig] | None,
):
    """Mask ``text`` using spans supplied as plain dicts.

    Each dict in ``raw_results`` must carry ``"entity_type"``, ``"start"`` and
    ``"end"``; ``"score"`` is optional and defaults to ``1.0``. A missing
    required key raises ``KeyError``.

    Returns the result of ``anonymizer.anonymize``; callers in this module
    read its ``.text`` and ``.items``.
    """
    spans = []
    for raw in raw_results:
        spans.append(
            RecognizerResult(
                entity_type=raw["entity_type"],
                start=raw["start"],
                end=raw["end"],
                score=raw.get("score", 1.0),
            )
        )
    return anonymizer.anonymize(
        text=text,
        analyzer_results=spans,
        operators=operators,
    )
216+
217+
@app.get("/health")
def health() -> dict[str, str]:
    """Health probe: always answers with a fixed ``{"status": "ok"}`` body."""
    return dict(status="ok")
@@ -186,35 +245,28 @@ def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]:
186245 return [r .to_dict () for r in results ]
187246
188247
@app.post("/analyze_batch")
def analyze_batch(req: AnalyzeBatchRequest) -> list[list[dict[str, Any]]]:
    """Analyze many texts in a single request.

    Batched counterpart to /analyze: delegates to the shared batch analyzer
    and returns one list of span dicts per input text, in request order.
    """
    # An empty entity list is treated the same as an omitted one.
    wanted_entities = req.entities if req.entities else None
    per_text_results = batch_analyzer.analyze_iterator(
        texts=req.texts,
        language=req.language,
        entities=wanted_entities,
        score_threshold=req.score_threshold,
    )
    payload = []
    for spans in per_text_results:
        payload.append([span.to_dict() for span in spans])
    return payload
259+
260+
189261@app .post ("/anonymize" )
190262def anonymize (req : AnonymizeRequest ) -> dict [str , Any ]:
191263 started = time .perf_counter ()
192- analyzer_results = [
193- RecognizerResult (
194- entity_type = r ["entity_type" ],
195- start = r ["start" ],
196- end = r ["end" ],
197- score = r .get ("score" , 1.0 ),
198- )
199- for r in req .analyzer_results
200- ]
201- raw_operators = req .anonymizers or req .operators
202- operators = None
203- if raw_operators :
204- operators = {}
205- for entity , raw_cfg in raw_operators .items ():
206- op_cfg = dict (raw_cfg )
207- op_type = op_cfg .pop ("type" , "replace" )
208- operators [entity ] = OperatorConfig (op_type , op_cfg )
209- result = anonymizer .anonymize (
210- text = req .text ,
211- analyzer_results = analyzer_results ,
212- operators = operators ,
213- )
264+ operators = build_operators (req .anonymizers or req .operators )
265+ result = run_anonymize (req .text , req .analyzer_results , operators )
214266 logger .info (
215267 "anonymize chars=%d spans=%d duration_ms=%.1f" ,
216268 len (req .text ),
217- len (analyzer_results ),
269+ len (req . analyzer_results ),
218270 (time .perf_counter () - started ) * 1000 ,
219271 )
220272 return {
@@ -230,3 +282,17 @@ def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
230282 for item in result .items
231283 ],
232284 }
285+
286+
@app.post("/anonymize_batch")
def anonymize_batch(req: AnonymizeBatchRequest) -> dict[str, list[str]]:
    """Mask many texts in a single request.

    Batched counterpart to /anonymize: the same operators are applied to every
    item and the masked texts come back under ``"texts"`` in request order.
    Anonymization is pure string work (no NLP), so callers should send only
    items with detected spans.

    Emits one timing log line per request, mirroring what /anonymize logs for
    a single text.
    """
    started = time.perf_counter()
    # Operators are identical for every item, so build them once up front.
    operators = build_operators(req.anonymizers or req.operators)
    masked = [
        run_anonymize(item.text, item.analyzer_results, operators).text
        for item in req.items
    ]
    logger.info(
        "anonymize_batch items=%d chars=%d spans=%d duration_ms=%.1f",
        len(req.items),
        sum(len(item.text) for item in req.items),
        sum(len(item.analyzer_results) for item in req.items),
        (time.perf_counter() - started) * 1000,
    )
    return {"texts": masked}
0 commit comments