|
1 | 1 | """Combined Presidio REST service: analyzer + anonymizer on one port. |
2 | 2 |
|
3 | | -Constructs one warm AnalyzerEngine (with a native check-digit VIN recognizer) |
4 | | -and one AnonymizerEngine at startup, exposing stock-compatible endpoints so a |
5 | | -single PRESIDIO_URL serves both. English only. |
| 3 | +Constructs one warm AnalyzerEngine (multi-language NLP + a native check-digit |
| 4 | +VIN recognizer) and one AnonymizerEngine at startup, exposing stock-compatible |
| 5 | +endpoints so a single PRESIDIO_URL serves both. |
6 | 6 | """ |
7 | 7 |
|
8 | 8 | from typing import Any |
9 | 9 |
|
10 | 10 | from fastapi import Body, FastAPI |
11 | 11 | from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult |
| 12 | +from presidio_analyzer.nlp_engine import NlpEngineProvider |
12 | 13 | from presidio_analyzer.predefined_recognizers import ( |
13 | 14 | AuAbnRecognizer, |
14 | 15 | AuAcnRecognizer, |
15 | 16 | AuMedicareRecognizer, |
16 | 17 | AuTfnRecognizer, |
| 18 | + EsNieRecognizer, |
| 19 | + EsNifRecognizer, |
| 20 | + FiPersonalIdentityCodeRecognizer, |
17 | 21 | InAadhaarRecognizer, |
18 | 22 | InPanRecognizer, |
19 | 23 | InPassportRecognizer, |
20 | 24 | InVehicleRegistrationRecognizer, |
21 | 25 | InVoterRecognizer, |
| 26 | + ItDriverLicenseRecognizer, |
| 27 | + ItFiscalCodeRecognizer, |
| 28 | + ItIdentityCardRecognizer, |
| 29 | + ItPassportRecognizer, |
| 30 | + ItVatCodeRecognizer, |
| 31 | + PlPeselRecognizer, |
22 | 32 | SgFinRecognizer, |
23 | 33 | SgUenRecognizer, |
24 | 34 | UkNinoRecognizer, |
25 | 35 | ) |
26 | 36 | from presidio_anonymizer import AnonymizerEngine |
27 | 37 | from presidio_anonymizer.entities import OperatorConfig |
28 | 38 |
|
29 | | -# English-capable predefined recognizers Presidio ships but does NOT load by |
30 | | -# default (UK_NINO, AU_*, IN_*, SG_*). es/it/pl/fi/th/ko recognizers are |
31 | | -# language-locked and excluded — this image is English only. |
| 39 | +# Languages served. Each needs its spaCy model installed in the image. The |
| 40 | +# es/it/pl/fi national-id recognizers (ES_NIF, IT_FISCAL_CODE, PL_PESEL, ...) |
| 41 | +# are added explicitly via EXTRA_RECOGNIZERS and fire once their model loads. |
| 42 | +NLP_CONFIGURATION = { |
| 43 | + "nlp_engine_name": "spacy", |
| 44 | + "models": [ |
| 45 | + {"lang_code": "en", "model_name": "en_core_web_lg"}, |
| 46 | + {"lang_code": "es", "model_name": "es_core_news_lg"}, |
| 47 | + {"lang_code": "it", "model_name": "it_core_news_lg"}, |
| 48 | + {"lang_code": "pl", "model_name": "pl_core_news_lg"}, |
| 49 | + {"lang_code": "fi", "model_name": "fi_core_news_lg"}, |
| 50 | + ], |
| 51 | +} |
| 52 | +SUPPORTED_LANGUAGES = [m["lang_code"] for m in NLP_CONFIGURATION["models"]] |
| 53 | + |
| 54 | +# Predefined recognizers Presidio ships but does NOT load into the default |
| 55 | +# registry — they must be added explicitly. Each carries its own |
| 56 | +# supported_language, so it fires under that language once its NLP model is |
| 57 | +# loaded. en: UK/AU/IN/SG locale ids; es/it/pl/fi: national ids. |
32 | 58 | EXTRA_RECOGNIZERS = [ |
33 | 59 | UkNinoRecognizer, |
34 | 60 | AuAbnRecognizer, |
|
42 | 68 | InPassportRecognizer, |
43 | 69 | SgFinRecognizer, |
44 | 70 | SgUenRecognizer, |
| 71 | + EsNifRecognizer, |
| 72 | + EsNieRecognizer, |
| 73 | + ItFiscalCodeRecognizer, |
| 74 | + ItDriverLicenseRecognizer, |
| 75 | + ItVatCodeRecognizer, |
| 76 | + ItPassportRecognizer, |
| 77 | + ItIdentityCardRecognizer, |
| 78 | + PlPeselRecognizer, |
| 79 | + FiPersonalIdentityCodeRecognizer, |
45 | 80 | ] |
46 | 81 |
|
47 | 82 |
|
@@ -75,7 +110,8 @@ def validate_result(self, pattern_text: str): |
75 | 110 |
|
76 | 111 |
|
77 | 112 | def build_analyzer() -> AnalyzerEngine: |
78 | | - analyzer = AnalyzerEngine() |
| 113 | + nlp_engine = NlpEngineProvider(nlp_configuration=NLP_CONFIGURATION).create_engine() |
| 114 | + analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=SUPPORTED_LANGUAGES) |
79 | 115 | vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7) |
80 | 116 | analyzer.registry.add_recognizer( |
81 | 117 | VinRecognizer( |
|
0 commit comments