Skip to content

Commit 815d875

Browse files
feat(presidio): add multi-language support (es/it/pl/fi)
Configure a multi-language spaCy NLP engine (en/es/it/pl/fi lg models) and explicitly register the national-id recognizers that Presidio ships but does not load by default: ES_NIF/NIE, IT_FISCAL_CODE/DRIVER_LICENSE/VAT_CODE/PASSPORT/IDENTITY_CARD, PL_PESEL, FI_PERSONAL_IDENTITY_CODE. Verified that the NLP-engine + explicit-registration path detects entities in-language (Finnish id, score 1.0).
1 parent 1001ca9 commit 815d875

2 files changed

Lines changed: 55 additions & 16 deletions

File tree

docker/presidio.Dockerfile

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,19 @@ COPY docker/presidio/requirements.txt ./requirements.txt
1818
RUN --mount=type=cache,target=/root/.cache/pip \
1919
pip install --no-cache-dir -r requirements.txt
2020

21-
# Pinned English spaCy model. Downloaded with retries/resume (the wheel is
22-
# ~400MB and truncates on flaky networks if pip fetches the URL directly).
23-
ARG SPACY_MODEL_VERSION=3.8.0
21+
# Pinned spaCy models (en + es/it/pl/fi, ~2.2GB total). Downloaded with
22+
# retries/resume — the large wheels truncate on flaky networks if pip fetches
23+
# the URLs directly.
24+
ARG SPACY_MODELS="en_core_web_lg-3.8.0 es_core_news_lg-3.8.0 it_core_news_lg-3.8.0 pl_core_news_lg-3.8.0 fi_core_news_lg-3.8.0"
2425
RUN --mount=type=cache,target=/root/.cache/pip \
25-
MODEL_WHL="en_core_web_lg-${SPACY_MODEL_VERSION}-py3-none-any.whl" && \
26-
curl -fL --retry 5 --retry-delay 5 --retry-all-errors -C - \
27-
-o "/tmp/${MODEL_WHL}" \
28-
"https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-${SPACY_MODEL_VERSION}/${MODEL_WHL}" && \
29-
pip install --no-cache-dir "/tmp/${MODEL_WHL}" && \
30-
rm "/tmp/${MODEL_WHL}"
26+
for model in ${SPACY_MODELS}; do \
27+
whl="${model}-py3-none-any.whl"; \
28+
curl -fL --retry 5 --retry-delay 5 --retry-all-errors -C - \
29+
-o "/tmp/${whl}" \
30+
"https://github.com/explosion/spacy-models/releases/download/${model}/${whl}" || exit 1; \
31+
done && \
32+
pip install --no-cache-dir /tmp/*.whl && \
33+
rm /tmp/*.whl
3134

3235
COPY docker/presidio/server.py ./server.py
3336

docker/presidio/server.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,60 @@
11
"""Combined Presidio REST service: analyzer + anonymizer on one port.
22
3-
Constructs one warm AnalyzerEngine (with a native check-digit VIN recognizer)
4-
and one AnonymizerEngine at startup, exposing stock-compatible endpoints so a
5-
single PRESIDIO_URL serves both. English only.
3+
Constructs one warm AnalyzerEngine (multi-language NLP + a native check-digit
4+
VIN recognizer) and one AnonymizerEngine at startup, exposing stock-compatible
5+
endpoints so a single PRESIDIO_URL serves both.
66
"""
77

88
from typing import Any
99

1010
from fastapi import Body, FastAPI
1111
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
12+
from presidio_analyzer.nlp_engine import NlpEngineProvider
1213
from presidio_analyzer.predefined_recognizers import (
1314
AuAbnRecognizer,
1415
AuAcnRecognizer,
1516
AuMedicareRecognizer,
1617
AuTfnRecognizer,
18+
EsNieRecognizer,
19+
EsNifRecognizer,
20+
FiPersonalIdentityCodeRecognizer,
1721
InAadhaarRecognizer,
1822
InPanRecognizer,
1923
InPassportRecognizer,
2024
InVehicleRegistrationRecognizer,
2125
InVoterRecognizer,
26+
ItDriverLicenseRecognizer,
27+
ItFiscalCodeRecognizer,
28+
ItIdentityCardRecognizer,
29+
ItPassportRecognizer,
30+
ItVatCodeRecognizer,
31+
PlPeselRecognizer,
2232
SgFinRecognizer,
2333
SgUenRecognizer,
2434
UkNinoRecognizer,
2535
)
2636
from presidio_anonymizer import AnonymizerEngine
2737
from presidio_anonymizer.entities import OperatorConfig
2838

29-
# English-capable predefined recognizers Presidio ships but does NOT load by
30-
# default (UK_NINO, AU_*, IN_*, SG_*). es/it/pl/fi/th/ko recognizers are
31-
# language-locked and excluded — this image is English only.
39+
# Languages served. Each needs its spaCy model installed in the image. The
40+
# es/it/pl/fi predefined recognizers (ES_NIF, IT_FISCAL_CODE, PL_PESEL, ...)
41+
# are NOT auto-loaded; they are registered explicitly via EXTRA_RECOGNIZERS.
42+
NLP_CONFIGURATION = {
43+
"nlp_engine_name": "spacy",
44+
"models": [
45+
{"lang_code": "en", "model_name": "en_core_web_lg"},
46+
{"lang_code": "es", "model_name": "es_core_news_lg"},
47+
{"lang_code": "it", "model_name": "it_core_news_lg"},
48+
{"lang_code": "pl", "model_name": "pl_core_news_lg"},
49+
{"lang_code": "fi", "model_name": "fi_core_news_lg"},
50+
],
51+
}
52+
SUPPORTED_LANGUAGES = [m["lang_code"] for m in NLP_CONFIGURATION["models"]]
53+
54+
# Predefined recognizers Presidio ships but does NOT load into the default
55+
# registry — they must be added explicitly. Each carries its own
56+
# supported_language, so it fires under that language once its NLP model is
57+
# loaded. en: UK/AU/IN/SG locale ids; es/it/pl/fi: national ids.
3258
EXTRA_RECOGNIZERS = [
3359
UkNinoRecognizer,
3460
AuAbnRecognizer,
@@ -42,6 +68,15 @@
4268
InPassportRecognizer,
4369
SgFinRecognizer,
4470
SgUenRecognizer,
71+
EsNifRecognizer,
72+
EsNieRecognizer,
73+
ItFiscalCodeRecognizer,
74+
ItDriverLicenseRecognizer,
75+
ItVatCodeRecognizer,
76+
ItPassportRecognizer,
77+
ItIdentityCardRecognizer,
78+
PlPeselRecognizer,
79+
FiPersonalIdentityCodeRecognizer,
4580
]
4681

4782

@@ -75,7 +110,8 @@ def validate_result(self, pattern_text: str):
75110

76111

77112
def build_analyzer() -> AnalyzerEngine:
78-
analyzer = AnalyzerEngine()
113+
nlp_engine = NlpEngineProvider(nlp_configuration=NLP_CONFIGURATION).create_engine()
114+
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=SUPPORTED_LANGUAGES)
79115
vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7)
80116
analyzer.registry.add_recognizer(
81117
VinRecognizer(

0 commit comments

Comments
 (0)