From 4315b87067c50c22dbe90b7ca16c0125a49fba0f Mon Sep 17 00:00:00 2001 From: Yurii Havrylko <10372700+yuriihavrylko@users.noreply.github.com> Date: Sun, 31 May 2026 11:49:52 +0200 Subject: [PATCH 1/4] feat: implement model sharing for spacy, gliner and HuggingFace recognizers to avoid in-memory duplicates --- .../nlp_engine/spacy_nlp_engine.py | 16 +++++++++-- .../ner/gliner_recognizer.py | 28 ++++++++++++++++--- .../ner/huggingface_ner_recognizer.py | 19 +++++++++++++ 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 8ec293be75..04bebabd42 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -63,16 +63,26 @@ def _enable_gpu(self) -> None: ) def load(self) -> None: - """Load the spaCy NLP model.""" + """Load the spaCy NLP model. + + When multiple languages share the same model_name (e.g. xx_ent_wiki_sm + for multilingual setups), a single spaCy Language instance is reused. + This avoids duplicate model memory and multiple independently growing + StringStore/Vocab instances. + """ logger.debug(f"Loading SpaCy models: {self.models}") self._enable_gpu() self.nlp = {} + loaded_models = {} for model in self.models: self._validate_model_params(model) - self._download_spacy_model_if_needed(model["model_name"]) - self.nlp[model["lang_code"]] = spacy.load(model["model_name"]) + model_name = model["model_name"] + if model_name not in loaded_models: + self._download_spacy_model_if_needed(model_name) + loaded_models[model_name] = spacy.load(model_name) + self.nlp[model["lang_code"]] = loaded_models[model_name] @staticmethod def _download_spacy_model_if_needed(model_name: str) -> None: diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py index 4e12f7e634..c5c68ec2b6 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py @@ -26,6 +26,11 @@ class GLiNERRecognizer(LocalRecognizer): """GLiNER model based entity recognizer.""" + # Class-level cache for sharing GLiNER models across instances. + # Keyed by (model_name, map_location, load_onnx_model, onnx_model_file). + # Avoids loading duplicate copies when the same model serves multiple languages. + _shared_models: dict = {} + def __init__( self, supported_entities: Optional[List[str]] = None, @@ -106,9 +111,7 @@ def __init__( self.model_name = model_name self.map_location = ( - map_location - if map_location is not None - else device_detector.get_device() + map_location if map_location is not None else device_detector.get_device() ) self.flat_ner = flat_ner @@ -142,10 +145,25 @@ def __init__( self.gliner_labels = list(self.model_to_presidio_entity_mapping.keys()) def load(self) -> None: - """Load the GLiNER model.""" + """Load the GLiNER model. + + Shares model instances across recognizers with identical configuration + to avoid loading duplicate copies for multilingual setups. + """ if not GLiNER: raise ImportError("GLiNER is not installed. Please install it.") + cache_key = ( + self.model_name, + self.map_location, + self.load_onnx_model, + self.onnx_model_file, + ) + if cache_key in GLiNERRecognizer._shared_models: + self.gliner = GLiNERRecognizer._shared_models[cache_key] + logger.info(f"Reusing shared GLiNER model for {self.model_name}") + return + self.gliner = GLiNER.from_pretrained( self.model_name, map_location=self.map_location, @@ -153,6 +171,8 @@ def load(self) -> None: onnx_model_file=self.onnx_model_file, **self.model_kwargs, ) + GLiNERRecognizer._shared_models[cache_key] = self.gliner + logger.info(f"Loaded GLiNER model: {self.model_name}") def analyze( self, diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/huggingface_ner_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/huggingface_ner_recognizer.py index 43b17683f1..7bcb3cfacb 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/huggingface_ner_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/huggingface_ner_recognizer.py @@ -75,6 +75,11 @@ class HuggingFaceNerRecognizer(LocalRecognizer): >>> analyzer.registry.add_recognizer(recognizer) """ + # Class-level cache for sharing HF pipelines across instances. + # Keyed by (model_name, tokenizer_name, aggregation_strategy, device). + # Avoids loading duplicate copies when the same model serves multiple languages. + _shared_pipelines: dict = {} + # Default label mapping from common NER models to Presidio entities DEFAULT_LABEL_MAPPING = { # Standard NER labels (CoNLL format) @@ -249,6 +254,8 @@ def load(self) -> None: This method handles: 1. Hardware acceleration setup (CUDA validation and fallback) 2. Lazy-loading of the heavyweight ML pipeline. + 3. Sharing pipelines across instances with identical configuration + to avoid loading duplicate model copies for multilingual setups. :raises ValueError: If model_name is not set """ @@ -275,6 +282,17 @@ def load(self) -> None: ) device = -1 + cache_key = ( + self.model_name, + self.tokenizer_name, + self.aggregation_strategy, + device, + ) + if cache_key in HuggingFaceNerRecognizer._shared_pipelines: + self.ner_pipeline = HuggingFaceNerRecognizer._shared_pipelines[cache_key] + logger.info(f"Reusing shared HuggingFace pipeline for {self.model_name}") + return + logger.info(f"Loading HuggingFace model: {self.model_name}, device={device}") try: @@ -285,6 +303,7 @@ def load(self) -> None: aggregation_strategy=self.aggregation_strategy, device=device, ) + HuggingFaceNerRecognizer._shared_pipelines[cache_key] = self.ner_pipeline logger.info(f"Successfully loaded {self.model_name}") except Exception: logger.exception(f"Failed to load model {self.model_name}") From 5c6bc67536d7cb78a06a9f02c98620145c71c7d6 Mon Sep 17 00:00:00 2001 From: Yurii Havrylko <10372700+yuriihavrylko@users.noreply.github.com> Date: Sun, 31 May 2026 11:50:24 +0200 Subject: [PATCH 2/4] test: add shared model caching tests for spacy, gliner and HuggingFace recognizers --- .../tests/test_gliner_recognizer.py | 66 +++++++++++++++++++ .../tests/test_huggingface_ner_recognizer.py | 66 +++++++++++++++++++ .../tests/test_spacy_nlp_engine.py | 53 ++++++++++++++- 3 files changed, 184 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/tests/test_gliner_recognizer.py b/presidio-analyzer/tests/test_gliner_recognizer.py index 528634441d..b236a76115 100644 --- a/presidio-analyzer/tests/test_gliner_recognizer.py +++ b/presidio-analyzer/tests/test_gliner_recognizer.py @@ -7,6 +7,14 @@ from presidio_analyzer.chunkers import CharacterBasedTextChunker +@pytest.fixture(autouse=True) +def clear_shared_model_cache(): + """Clear GLiNER shared model cache before and after each test.""" + GLiNERRecognizer._shared_models.clear() + yield + GLiNERRecognizer._shared_models.clear() + + @pytest.fixture def mock_gliner(): """ @@ -323,3 +331,61 @@ def test_when_model_kwargs_then_passes_to_from_pretrained(): assert call_kwargs["custom_param2"] == 42 +def test_load_shares_model_across_instances(): + """Test that instances with identical config share a single GLiNER model.""" + if sys.version_info < (3, 10): + pytest.skip("gliner requires Python >= 3.10") + + pytest.importorskip("gliner", reason="GLiNER package is not installed") + + GLiNERRecognizer._shared_models.clear() + try: + with patch(GLINER_MOCK_PATH) as mock_gliner_class: + mock_model = MagicMock() + mock_gliner_class.from_pretrained.return_value = mock_model + + rec_en = GLiNERRecognizer( + supported_entities=["PERSON"], supported_language="en" + ) + rec_es = GLiNERRecognizer( + supported_entities=["PERSON"], supported_language="es" + ) + rec_en.load() + rec_es.load() + + mock_gliner_class.from_pretrained.assert_called_once() + assert rec_en.gliner is rec_es.gliner + finally: + GLiNERRecognizer._shared_models.clear() + + +def test_load_does_not_share_model_for_different_models(): + """Test that instances with different model names get separate models.""" + if sys.version_info < (3, 10): + pytest.skip("gliner requires Python >= 3.10") + + pytest.importorskip("gliner", reason="GLiNER package is not installed") + + GLiNERRecognizer._shared_models.clear() + try: + with patch(GLINER_MOCK_PATH) as mock_gliner_class: + mock_gliner_class.from_pretrained.side_effect = [ + MagicMock(), + MagicMock(), + ] + + rec_a = GLiNERRecognizer( + supported_entities=["PERSON"], model_name="model-a" + ) + rec_b = GLiNERRecognizer( + supported_entities=["PERSON"], model_name="model-b" + ) + rec_a.load() + rec_b.load() + + assert mock_gliner_class.from_pretrained.call_count == 2 + assert rec_a.gliner is not rec_b.gliner + finally: + GLiNERRecognizer._shared_models.clear() + + diff --git a/presidio-analyzer/tests/test_huggingface_ner_recognizer.py b/presidio-analyzer/tests/test_huggingface_ner_recognizer.py index 7191aa6a43..b2d2fd9acc 100644 --- a/presidio-analyzer/tests/test_huggingface_ner_recognizer.py +++ b/presidio-analyzer/tests/test_huggingface_ner_recognizer.py @@ -20,6 +20,18 @@ TEST_MODEL_NAME = "dslim/bert-base-NER" +@pytest.fixture(autouse=True) +def clear_shared_pipeline_cache(): + """Clear the shared pipeline cache before and after each test. + + The class-level _shared_pipelines dict persists across tests, + so stale mock pipelines can leak between tests. + """ + HuggingFaceNerRecognizer._shared_pipelines.clear() + yield + HuggingFaceNerRecognizer._shared_pipelines.clear() + + @pytest.fixture def mock_torch_installed(): """Fixture to mock torch as installed and configured.""" @@ -564,6 +576,60 @@ def test_hf_recognizer_prediction_edge_cases(mock_recognizer, caplog): assert "NER prediction failed" in caplog.text +@pytest.mark.usefixtures("mock_torch_installed") +def test_load_shares_pipeline_across_instances(): + """Test that instances with identical config share a single HF pipeline.""" + with patch(HF_PIPELINE_PATH) as mock_hf_pipeline: + mock_hf_pipeline.return_value = MagicMock() + + rec_en = HuggingFaceNerRecognizer( + model_name="test-model", supported_language="en", device=-1 + ) + rec_es = HuggingFaceNerRecognizer( + model_name="test-model", supported_language="es", device=-1 + ) + rec_en.load() + rec_es.load() + + # Pipeline should be created only once + mock_hf_pipeline.assert_called_once() + assert rec_en.ner_pipeline is rec_es.ner_pipeline + + +@pytest.mark.usefixtures("mock_torch_installed") +def test_load_does_not_share_pipeline_for_different_models(): + """Test that instances with different models get separate pipelines.""" + with patch(HF_PIPELINE_PATH) as mock_hf_pipeline: + mock_hf_pipeline.side_effect = [MagicMock(), MagicMock()] + + rec_a = HuggingFaceNerRecognizer( + model_name="model-a", supported_language="en", device=-1 + ) + rec_b = HuggingFaceNerRecognizer( + model_name="model-b", supported_language="en", device=-1 + ) + rec_a.load() + rec_b.load() + + assert mock_hf_pipeline.call_count == 2 + assert rec_a.ner_pipeline is not rec_b.ner_pipeline + + +@pytest.mark.usefixtures("mock_torch_installed") +def test_load_skip_when_already_loaded(): + """Test that load() is a no-op when pipeline is already assigned.""" + with patch(HF_PIPELINE_PATH) as mock_hf_pipeline: + mock_hf_pipeline.return_value = MagicMock() + + rec = HuggingFaceNerRecognizer( + model_name="test-model", supported_language="en", device=-1 + ) + rec.load() + rec.load() # second call should be a no-op + + mock_hf_pipeline.assert_called_once() + + def test_hf_recognizer_analyze_handles_malformed_pipeline_output( mock_recognizer, caplog ): diff --git a/presidio-analyzer/tests/test_spacy_nlp_engine.py b/presidio-analyzer/tests/test_spacy_nlp_engine.py index 152032c1c1..c7eee3f94c 100644 --- a/presidio-analyzer/tests/test_spacy_nlp_engine.py +++ b/presidio-analyzer/tests/test_spacy_nlp_engine.py @@ -149,5 +149,56 @@ def test_when_cpu_device_then_gpu_not_configured(): engine = SpacyNlpEngine(models=[{"lang_code": "en", "model_name": "en_core_web_sm"}]) engine.load() - + mock_spacy.require_gpu.assert_not_called() + + +def test_load_shares_model_when_same_model_name(): + """Test that languages sharing the same model_name reuse one spaCy instance.""" + with patch("presidio_analyzer.nlp_engine.spacy_nlp_engine.device_detector") as mock_detector: + mock_detector.get_device.return_value = "cpu" + + with patch("presidio_analyzer.nlp_engine.spacy_nlp_engine.spacy") as mock_spacy: + mock_model = MagicMock() + mock_spacy.load.return_value = mock_model + mock_spacy.util.is_package.return_value = True + + engine = SpacyNlpEngine( + models=[ + {"lang_code": "en", "model_name": "xx_ent_wiki_sm"}, + {"lang_code": "es", "model_name": "xx_ent_wiki_sm"}, + {"lang_code": "de", "model_name": "xx_ent_wiki_sm"}, + ] + ) + engine.load() + + # spacy.load should be called only once for the shared model + mock_spacy.load.assert_called_once_with("xx_ent_wiki_sm") + # All languages should reference the same instance + assert engine.nlp["en"] is engine.nlp["es"] + assert engine.nlp["es"] is engine.nlp["de"] + + +def test_load_keeps_separate_models_when_different_model_names(): + """Test that different model_names are loaded separately.""" + with patch("presidio_analyzer.nlp_engine.spacy_nlp_engine.device_detector") as mock_detector: + mock_detector.get_device.return_value = "cpu" + + with patch("presidio_analyzer.nlp_engine.spacy_nlp_engine.spacy") as mock_spacy: + mock_en = MagicMock() + mock_de = MagicMock() + mock_spacy.load.side_effect = [mock_en, mock_de] + mock_spacy.util.is_package.return_value = True + + engine = SpacyNlpEngine( + models=[ + {"lang_code": "en", "model_name": "en_core_web_sm"}, + {"lang_code": "de", "model_name": "de_core_news_sm"}, + ] + ) + engine.load() + + assert mock_spacy.load.call_count == 2 + assert engine.nlp["en"] is mock_en + assert engine.nlp["de"] is mock_de + assert engine.nlp["en"] is not engine.nlp["de"] From e2e9a0806be9d5125a32b528f1d1097a093d3d58 Mon Sep 17 00:00:00 2001 From: Yurii Havrylko <10372700+yuriihavrylko@users.noreply.github.com> Date: Sun, 31 May 2026 16:26:25 +0200 Subject: [PATCH 3/4] fix: exclude None values in recognizer registry configuration validation --- presidio-analyzer/presidio_analyzer/input_validation/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index a256ca2a58..0ac9ae10b5 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -81,7 +81,7 @@ def validate_recognizer_registry_configuration( # Use Pydantic model for validation validated_config = RecognizerRegistryConfig(**config) # Use model_dump() without exclude_unset to include default values - return validated_config.model_dump(exclude_unset=False) + return validated_config.model_dump(exclude_unset=False, exclude_none=True) except ValidationError as e: raise ValueError("Invalid recognizer registry configuration") from e From 0e04855aba14af4c9e495b2bce46e1dec478537d Mon Sep 17 00:00:00 2001 From: Yurii Havrylko <10372700+yuriihavrylko@users.noreply.github.com> Date: Sun, 31 May 2026 17:09:08 +0200 Subject: [PATCH 4/4] docs: add configuration files for multilingual support in analyzer, nlp engine, and recognizers --- .../transformers_multilingual/analyzer.yaml | 10 +++++ .../transformers_multilingual/nlp_engine.yaml | 29 +++++++++++++++ .../recognizers.yaml | 37 +++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/analyzer.yaml create mode 100644 presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/nlp_engine.yaml create mode 100644 presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/recognizers.yaml diff --git a/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/analyzer.yaml b/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/analyzer.yaml new file mode 100644 index 0000000000..1593887d52 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/analyzer.yaml @@ -0,0 +1,10 @@ +supported_languages: + - en + - es + - de + - fr + - it + - pt + - nl + - sv +default_score_threshold: 0 diff --git a/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/nlp_engine.yaml b/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/nlp_engine.yaml new file mode 100644 index 0000000000..3c2d4cab51 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/nlp_engine.yaml @@ -0,0 +1,29 @@ +nlp_engine_name: spacy + + +models: + - + lang_code: en + model_name: xx_ent_wiki_sm + - + lang_code: es + model_name: xx_ent_wiki_sm + - + lang_code: de + model_name: xx_ent_wiki_sm + - + lang_code: fr + model_name: xx_ent_wiki_sm + - + lang_code: it + model_name: xx_ent_wiki_sm + - + lang_code: pt + model_name: xx_ent_wiki_sm + - + lang_code: nl + model_name: xx_ent_wiki_sm + - + lang_code: sv + model_name: xx_ent_wiki_sm + diff --git a/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/recognizers.yaml b/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/recognizers.yaml new file mode 100644 index 0000000000..f31d4c3b24 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/conf/transformers_multilingual/recognizers.yaml @@ -0,0 +1,37 @@ +supported_languages: + - en + - es + - de + - fr + - it + - pt + - nl + - sv +global_regex_flags: 26 + +recognizers: + - name: SpacyRecognizer + type: predefined + enabled: false + + - name: "HuggingFace NER" + type: predefined + class_name: HuggingFaceNerRecognizer + model_name: dslim/bert-base-NER + supported_languages: + - en + - es + - de + - fr + - it + - pt + - nl + - sv + supported_entities: + - PERSON + - LOCATION + - ORGANIZATION + - MISC + aggregation_strategy: simple + threshold: 0.3 + device: cpu