Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
supported_languages:
- en
- es
- de
- fr
- it
- pt
- nl
- sv
default_score_threshold: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
nlp_engine_name: spacy


models:
-
lang_code: en
model_name: xx_ent_wiki_sm
-
lang_code: es
model_name: xx_ent_wiki_sm
-
lang_code: de
model_name: xx_ent_wiki_sm
-
lang_code: fr
model_name: xx_ent_wiki_sm
-
lang_code: it
model_name: xx_ent_wiki_sm
-
lang_code: pt
model_name: xx_ent_wiki_sm
-
lang_code: nl
model_name: xx_ent_wiki_sm
-
lang_code: sv
model_name: xx_ent_wiki_sm

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
supported_languages:
- en
- es
- de
- fr
- it
- pt
- nl
- sv
global_regex_flags: 26

recognizers:
- name: SpacyRecognizer
type: predefined
enabled: false

- name: "HuggingFace NER"
type: predefined
class_name: HuggingFaceNerRecognizer
model_name: dslim/bert-base-NER
supported_languages:
- en
- es
- de
- fr
- it
- pt
- nl
- sv
supported_entities:
- PERSON
- LOCATION
- ORGANIZATION
- MISC
aggregation_strategy: simple
threshold: 0.3
device: cpu
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def validate_recognizer_registry_configuration(
# Use Pydantic model for validation
validated_config = RecognizerRegistryConfig(**config)
# Use model_dump() without exclude_unset to include default values
return validated_config.model_dump(exclude_unset=False)
return validated_config.model_dump(exclude_unset=False, exclude_none=True)
except ValidationError as e:
raise ValueError("Invalid recognizer registry configuration") from e

Expand Down
16 changes: 13 additions & 3 deletions presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,26 @@ def _enable_gpu(self) -> None:
)

def load(self) -> None:
    """Load the spaCy NLP model for every configured language.

    When multiple languages share the same model_name (e.g. xx_ent_wiki_sm
    for multilingual setups), a single spaCy Language instance is reused.
    This avoids duplicate model memory and multiple independently growing
    StringStore/Vocab instances.
    """
    logger.debug(f"Loading SpaCy models: {self.models}")

    self._enable_gpu()

    self.nlp = {}
    # model_name -> loaded pipeline. Each distinct model is downloaded and
    # loaded exactly once, no matter how many lang_codes point at it.
    loaded_models = {}
    for model in self.models:
        self._validate_model_params(model)
        model_name = model["model_name"]
        if model_name not in loaded_models:
            self._download_spacy_model_if_needed(model_name)
            loaded_models[model_name] = spacy.load(model_name)
        # Languages that name the same model point at the same object.
        self.nlp[model["lang_code"]] = loaded_models[model_name]

@staticmethod
def _download_spacy_model_if_needed(model_name: str) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
class GLiNERRecognizer(LocalRecognizer):
"""GLiNER model based entity recognizer."""

# Class-level cache for sharing GLiNER models across instances.
# Keyed by (model_name, map_location, load_onnx_model, onnx_model_file).
# Avoids loading duplicate copies when the same model serves multiple languages.
_shared_models: dict = {}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we think of an alternative approach using dependency injection or a model registry? With a class-level cache, the model would never get released.
For example:

model = GLiNER.from_pretrained(...)

recognizer_en = GLiNERRecognizer(..., model=model)
recognizer_es = GLiNERRecognizer(..., model=model)
recognizer_fr = GLiNERRecognizer(..., model=model)

Or

self.gliner = GLiNERModelRegistry.get_model(...)

The user should be able to control this model registry (add, remove, update)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point - the initial approach is naive, but it showcases the reality of the issue and the potential gains in multilingual setups. My intent was to implement it for both programmatic and yaml config-based use cases.

Here are the options I'm considering based on DI and model registry ideas:

Option A: DI + loader-level sharing
Dependency injection as you showed, plus

# YAML path:
# RecognizerListLoader detects same-model recognizers and
# injects the loaded model into subsequent instances

Simple, no new classes. But couples the loader to each recognizer's internals (gliner_model= vs ner_pipeline=, different cache key shapes). Every new recognizer type would need loader changes.

Option B: ModelRegistry

# Programmatic — user controls the registry:
registry = ModelRegistry()
rec_en = GLiNERRecognizer(model_registry=registry, supported_language="en")
rec_es = GLiNERRecognizer(model_registry=registry, supported_language="es")
# First instance loads and registers, second reuses

# Or direct injection (no registry needed):
model = GLiNER.from_pretrained(...)
rec = GLiNERRecognizer(gliner_model=model, ...)

# YAML path — automatic:
# RecognizerListLoader creates a ModelRegistry and injects it
# into recognizers that accept `model_registry` parameter

The caching logic (key shape, what to store) stays inside each recognizer - the loader just provides the shared bucket and doesn't need model-specific knowledge.

Option C: Multi-language recognizer

GLiNERRecognizer(supported_languages=["en", "es", "de", ...])

Eliminates the problem at the root - one instance, one model, no sharing needed. But EntityRecognizer is built around supported_language (singular). It is the cleanest long-term solution, but a major refactor that is out of scope for a bug fix or small improvement.

Option D: Lazy loading
Remove self.load() from EntityRecognizer.__init__, load on first analyze() call instead. Makes DI/registry/sharing trivially easy since all instances are created cheaply and models configured afterward. But it's a base class behavior change that affects every recognizer.

I prefer option B for this PR - covers both use cases, keeps the loader generic, gives the user full control.

Options C and D are worth considering as longer-term architectural improvements.

What do you think?


def __init__(
self,
supported_entities: Optional[List[str]] = None,
Expand Down Expand Up @@ -106,9 +111,7 @@ def __init__(
self.model_name = model_name

self.map_location = (
map_location
if map_location is not None
else device_detector.get_device()
map_location if map_location is not None else device_detector.get_device()
)

self.flat_ner = flat_ner
Expand Down Expand Up @@ -142,17 +145,34 @@ def __init__(
self.gliner_labels = list(self.model_to_presidio_entity_mapping.keys())

def load(self) -> None:
    """Load the GLiNER model, reusing an already-loaded one when possible.

    Shares model instances across recognizers with identical configuration
    to avoid loading duplicate copies for multilingual setups.

    :raises ImportError: If the GLiNER package is not available
    """
    if not GLiNER:
        raise ImportError("GLiNER is not installed. Please install it.")

    # model_kwargs is forwarded to from_pretrained(), so two recognizers that
    # differ only in those kwargs must not share a model. The kwargs are
    # folded into the key via repr() of the sorted items, which keeps the
    # key hashable even when individual values are not.
    cache_key = (
        self.model_name,
        self.map_location,
        self.load_onnx_model,
        self.onnx_model_file,
        repr(sorted(self.model_kwargs.items())),
    )
    if cache_key in GLiNERRecognizer._shared_models:
        self.gliner = GLiNERRecognizer._shared_models[cache_key]
        logger.info(f"Reusing shared GLiNER model for {self.model_name}")
        return

    self.gliner = GLiNER.from_pretrained(
        self.model_name,
        map_location=self.map_location,
        load_onnx_model=self.load_onnx_model,
        onnx_model_file=self.onnx_model_file,
        **self.model_kwargs,
    )
    # Publish only after a successful load so a failed attempt is never cached.
    GLiNERRecognizer._shared_models[cache_key] = self.gliner
    logger.info(f"Loaded GLiNER model: {self.model_name}")

def analyze(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ class HuggingFaceNerRecognizer(LocalRecognizer):
>>> analyzer.registry.add_recognizer(recognizer)
"""

# Class-level cache for sharing HF pipelines across instances.
# Keyed by (model_name, tokenizer_name, aggregation_strategy, device).
# Avoids loading duplicate copies when the same model serves multiple languages.
_shared_pipelines: dict = {}

# Default label mapping from common NER models to Presidio entities
DEFAULT_LABEL_MAPPING = {
# Standard NER labels (CoNLL format)
Expand Down Expand Up @@ -249,6 +254,8 @@ def load(self) -> None:
This method handles:
1. Hardware acceleration setup (CUDA validation and fallback)
2. Lazy-loading of the heavyweight ML pipeline.
3. Sharing pipelines across instances with identical configuration
to avoid loading duplicate model copies for multilingual setups.

:raises ValueError: If model_name is not set
"""
Expand All @@ -275,6 +282,17 @@ def load(self) -> None:
)
device = -1

cache_key = (
self.model_name,
self.tokenizer_name,
self.aggregation_strategy,
device,
)
if cache_key in HuggingFaceNerRecognizer._shared_pipelines:
self.ner_pipeline = HuggingFaceNerRecognizer._shared_pipelines[cache_key]
logger.info(f"Reusing shared HuggingFace pipeline for {self.model_name}")
return

logger.info(f"Loading HuggingFace model: {self.model_name}, device={device}")

try:
Expand All @@ -285,6 +303,7 @@ def load(self) -> None:
aggregation_strategy=self.aggregation_strategy,
device=device,
)
HuggingFaceNerRecognizer._shared_pipelines[cache_key] = self.ner_pipeline
logger.info(f"Successfully loaded {self.model_name}")
except Exception:
logger.exception(f"Failed to load model {self.model_name}")
Expand Down
66 changes: 66 additions & 0 deletions presidio-analyzer/tests/test_gliner_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@
from presidio_analyzer.chunkers import CharacterBasedTextChunker


@pytest.fixture(autouse=True)
def clear_shared_model_cache():
    """Reset the class-level GLiNER model cache around every test.

    The cache is emptied once before the test body runs and once more after
    it finishes, so entries stored by one test are gone before the next.
    """
    GLiNERRecognizer._shared_models.clear()  # setup: start from an empty cache
    yield
    GLiNERRecognizer._shared_models.clear()  # teardown: drop what the test stored


@pytest.fixture
def mock_gliner():
"""
Expand Down Expand Up @@ -323,3 +331,61 @@ def test_when_model_kwargs_then_passes_to_from_pretrained():
assert call_kwargs["custom_param2"] == 42


def test_load_shares_model_across_instances():
    """Test that instances with identical config share a single GLiNER model."""
    if sys.version_info < (3, 10):
        pytest.skip("gliner requires Python >= 3.10")

    pytest.importorskip("gliner", reason="GLiNER package is not installed")

    # No manual cache reset or try/finally here: the autouse
    # clear_shared_model_cache fixture already empties
    # GLiNERRecognizer._shared_models before and after every test.
    with patch(GLINER_MOCK_PATH) as mock_gliner_class:
        mock_model = MagicMock()
        mock_gliner_class.from_pretrained.return_value = mock_model

        rec_en = GLiNERRecognizer(
            supported_entities=["PERSON"], supported_language="en"
        )
        rec_es = GLiNERRecognizer(
            supported_entities=["PERSON"], supported_language="es"
        )
        rec_en.load()
        rec_es.load()

        # One underlying load, and both recognizers hold that exact object.
        mock_gliner_class.from_pretrained.assert_called_once()
        assert rec_en.gliner is rec_es.gliner
        assert rec_en.gliner is mock_model


def test_load_does_not_share_model_for_different_models():
    """Test that instances with different model names get separate models."""
    if sys.version_info < (3, 10):
        pytest.skip("gliner requires Python >= 3.10")

    pytest.importorskip("gliner", reason="GLiNER package is not installed")

    # No manual cache reset or try/finally here: the autouse
    # clear_shared_model_cache fixture already empties
    # GLiNERRecognizer._shared_models before and after every test.
    with patch(GLINER_MOCK_PATH) as mock_gliner_class:
        # A distinct mock per call, so a shared model would be detectable.
        mock_gliner_class.from_pretrained.side_effect = [
            MagicMock(),
            MagicMock(),
        ]

        rec_a = GLiNERRecognizer(
            supported_entities=["PERSON"], model_name="model-a"
        )
        rec_b = GLiNERRecognizer(
            supported_entities=["PERSON"], model_name="model-b"
        )
        rec_a.load()
        rec_b.load()

        assert mock_gliner_class.from_pretrained.call_count == 2
        assert rec_a.gliner is not rec_b.gliner


66 changes: 66 additions & 0 deletions presidio-analyzer/tests/test_huggingface_ner_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@
TEST_MODEL_NAME = "dslim/bert-base-NER"


@pytest.fixture(autouse=True)
def clear_shared_pipeline_cache():
    """Reset the class-level HuggingFace pipeline cache around every test.

    _shared_pipelines lives on the class and therefore outlives any single
    test; without this reset a mock pipeline stored by one test could be
    picked up by a later one.
    """
    HuggingFaceNerRecognizer._shared_pipelines.clear()  # setup
    yield
    HuggingFaceNerRecognizer._shared_pipelines.clear()  # teardown


@pytest.fixture
def mock_torch_installed():
"""Fixture to mock torch as installed and configured."""
Expand Down Expand Up @@ -564,6 +576,60 @@ def test_hf_recognizer_prediction_edge_cases(mock_recognizer, caplog):
assert "NER prediction failed" in caplog.text


@pytest.mark.usefixtures("mock_torch_installed")
def test_load_shares_pipeline_across_instances():
    """Recognizers configured identically must end up with one HF pipeline."""
    with patch(HF_PIPELINE_PATH, return_value=MagicMock()) as mock_hf_pipeline:
        # Same model and device, different language only.
        recognizers = [
            HuggingFaceNerRecognizer(
                model_name="test-model", supported_language=language, device=-1
            )
            for language in ("en", "es")
        ]
        for recognizer in recognizers:
            recognizer.load()

        english, spanish = recognizers
        # The pipeline factory ran a single time and both instances hold its result.
        mock_hf_pipeline.assert_called_once()
        assert english.ner_pipeline is spanish.ner_pipeline


@pytest.mark.usefixtures("mock_torch_installed")
def test_load_does_not_share_pipeline_for_different_models():
    """Recognizers that name different models must get distinct pipelines."""
    distinct_pipelines = [MagicMock(), MagicMock()]
    with patch(HF_PIPELINE_PATH, side_effect=distinct_pipelines) as mock_hf_pipeline:
        # Same language and device, different model name only.
        recognizers = [
            HuggingFaceNerRecognizer(
                model_name=name, supported_language="en", device=-1
            )
            for name in ("model-a", "model-b")
        ]
        for recognizer in recognizers:
            recognizer.load()

        first, second = recognizers
        assert mock_hf_pipeline.call_count == 2
        assert first.ner_pipeline is not second.ner_pipeline


@pytest.mark.usefixtures("mock_torch_installed")
def test_load_skip_when_already_loaded():
    """Test that a repeated load() neither rebuilds nor replaces the pipeline."""
    with patch(HF_PIPELINE_PATH) as mock_hf_pipeline:
        mock_hf_pipeline.return_value = MagicMock()

        rec = HuggingFaceNerRecognizer(
            model_name="test-model", supported_language="en", device=-1
        )
        rec.load()
        pipeline_after_first_load = rec.ner_pipeline
        rec.load()  # second call must not construct another pipeline

        mock_hf_pipeline.assert_called_once()
        # Counting constructions alone would miss a load() that swapped the
        # attribute for something else, so pin the object identity as well.
        assert rec.ner_pipeline is pipeline_after_first_load


def test_hf_recognizer_analyze_handles_malformed_pipeline_output(
mock_recognizer, caplog
):
Expand Down
Loading