diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index 3b58101d92..8c6ac19480 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -99,6 +99,13 @@ def validate_language_configuration(self): return self + def _recognizer_description_for_errors(self) -> str: + if self.name is not None: + return self.name + if self.class_name is not None: + return self.class_name + return "recognizer" + @model_validator(mode="after") def validate_entity_configuration(self): """Ensure proper entity validation.""" @@ -108,8 +115,9 @@ def validate_entity_configuration(self): ) if user_provided_both: + describe = self._recognizer_description_for_errors() raise ValueError( - f"Recognizer {self.name} has both " + f"Recognizer {describe} has both " "'supported_entity' and 'supported_entities' specified." ) @@ -140,11 +148,24 @@ class PredefinedRecognizerConfig(BaseRecognizerConfig): """Configuration for predefined recognizers.""" type: str = Field(default="predefined", description="Type of recognizer") + name: Optional[str] = Field( + default=None, + description=( + "Instance name for analysis results; optional when " + "`class_name` is set — some recognizers infer a stable default " + "(e.g. GLiNERRecognizer from model_name)." + ), + ) @model_validator(mode="after") def validate_predefined_recognizer_exists(self): """Validate that the predefined recognizer class actually exists.""" recognizer_class_name = self.class_name if self.class_name else self.name + if recognizer_class_name is None: + raise ValueError( + "Predefined recognizer requires either 'name' " + "(shorthand/class key) or 'class_name'." 
+ ) try: RecognizerListLoader.get_existing_recognizer_cls(recognizer_class_name) except PredefinedRecognizerNotFoundError as e: @@ -350,6 +371,7 @@ class RecognizerRegistryConfig(BaseModel): recognizers: List[ Union[ HuggingFaceRecognizerConfig, + GLiNERRecognizerConfig, PredefinedRecognizerConfig, CustomRecognizerConfig, str, diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py index 4e12f7e634..66f35e5791 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py @@ -1,5 +1,7 @@ +import hashlib import json import logging +import re from typing import Dict, List, Optional from presidio_analyzer import ( @@ -22,6 +24,32 @@ logger = logging.getLogger("presidio-analyzer") +_DEFAULT_GLINER_MODEL_NAME = "urchade/gliner_multi_pii-v1" +_LEGACY_GLINER_RECOGNIZER_NAME = "GLiNERRecognizer" + + +def _sanitize_model_name_for_instance_name(model_name: str) -> str: + """Map a HF-style model id to a deterministic, collision-resistant suffix.""" + + sanitized = re.sub(r"[^0-9A-Za-z]+", "_", model_name) + sanitized = re.sub(r"_+", "_", sanitized).strip("_") + model_hash = hashlib.sha256(model_name.encode("utf-8")).hexdigest()[:8] + if not sanitized: + sanitized = "model" + return f"{sanitized}_{model_hash}" + + +def _default_gliner_recognizer_name(model_name: str) -> str: + """Stable default recognizer ``name`` when the user omits ``name``. + + Preserve the legacy name for the built-in default model for backwards compatibility. 
+ """ + + if model_name == _DEFAULT_GLINER_MODEL_NAME: + return _LEGACY_GLINER_RECOGNIZER_NAME + suffix = _sanitize_model_name_for_instance_name(model_name) + return f"{_LEGACY_GLINER_RECOGNIZER_NAME}_{suffix}" + class GLiNERRecognizer(LocalRecognizer): """GLiNER model based entity recognizer.""" @@ -29,12 +57,12 @@ class GLiNERRecognizer(LocalRecognizer): def __init__( self, supported_entities: Optional[List[str]] = None, - name: str = "GLiNERRecognizer", + name: Optional[str] = None, supported_language: str = "en", version: str = "0.0.1", context: Optional[List[str]] = None, entity_mapping: Optional[Dict[str, str]] = None, - model_name: str = "urchade/gliner_multi_pii-v1", + model_name: str = _DEFAULT_GLINER_MODEL_NAME, flat_ner: bool = True, multi_label: bool = False, threshold: float = 0.30, @@ -51,7 +79,9 @@ def __init__( :param supported_entities: List of supported entities for this recognizer. If None, all entities in Presidio's default configuration will be used. see `NerModelConfiguration` - :param name: Name of the recognizer + :param name: Name of the recognizer. When omitted, a deterministic name is + derived from ``model_name`` (with the default model preserving the legacy + name ``GLiNERRecognizer``). 
:param supported_language: Language code to use for the recognizer :param version: Version of the recognizer :param context: N/A for this recognizer @@ -131,9 +161,13 @@ def __init__( self.gliner = None + resolved_name = name if name is not None else _default_gliner_recognizer_name( + model_name + ) + super().__init__( supported_entities=supported_entities, - name=name, + name=resolved_name, supported_language=supported_language, version=version, context=context, diff --git a/presidio-analyzer/tests/test_gliner_recognizer.py b/presidio-analyzer/tests/test_gliner_recognizer.py index 528634441d..2dcf37d4e8 100644 --- a/presidio-analyzer/tests/test_gliner_recognizer.py +++ b/presidio-analyzer/tests/test_gliner_recognizer.py @@ -1,10 +1,22 @@ +import inspect import sys +from unittest.mock import MagicMock, patch import pytest -from unittest.mock import MagicMock, patch -from presidio_analyzer.predefined_recognizers import GLiNERRecognizer from presidio_analyzer.chunkers import CharacterBasedTextChunker +from presidio_analyzer.predefined_recognizers import GLiNERRecognizer +from presidio_analyzer.predefined_recognizers.ner import gliner_recognizer + + +@pytest.fixture +def noop_gliner_load(monkeypatch): + """Avoid GLiNER installs/weights by stubbing recognizer load.""" + + def _noop(self): + return None + + monkeypatch.setattr(GLiNERRecognizer, "load", _noop) @pytest.fixture @@ -261,6 +273,52 @@ def mock_predict_entities(text, labels, flat_ner, threshold, multi_label): ) +def test_when_name_omitted_derives_distinct_names_from_model(noop_gliner_load): + """No GLiNER package or weights are required (load is stubbed).""" + + a = GLiNERRecognizer(model_name="model/a") + b = GLiNERRecognizer(model_name="model/b") + assert a.name.startswith("GLiNERRecognizer_model_a_") + assert b.name.startswith("GLiNERRecognizer_model_b_") + assert a.name != b.name + + +def test_when_sanitized_model_names_collide_then_hash_keeps_names_distinct( + noop_gliner_load, +): + """Model IDs 
differing only by punctuation should still get unique names.""" + + a = GLiNERRecognizer(model_name="team/model-a") + b = GLiNERRecognizer(model_name="team_model_a") + punctuation_only = GLiNERRecognizer(model_name="---") + + assert a.name.startswith("GLiNERRecognizer_team_model_a_") + assert b.name.startswith("GLiNERRecognizer_team_model_a_") + assert a.name != b.name + assert punctuation_only.name.startswith("GLiNERRecognizer_model_") + + +def test_when_explicit_name_then_preserved(noop_gliner_load): + """Explicit recognizer names should not be replaced.""" + r = GLiNERRecognizer(name="custom", model_name="model/a") + assert r.name == "custom" + + +def test_when_default_model_and_name_omitted_keeps_legacy_name(noop_gliner_load): + """The built-in GLiNER default should keep the historical recognizer name.""" + r = GLiNERRecognizer() + assert r.name == "GLiNERRecognizer" + + +def test_default_model_parameter_reuses_shared_constant(): + """Keep the constructor default and legacy-name comparison from drifting.""" + signature = inspect.signature(GLiNERRecognizer.__init__) + assert ( + signature.parameters["model_name"].default + == gliner_recognizer._DEFAULT_GLINER_MODEL_NAME + ) + + @pytest.mark.parametrize( "load_onnx_model,onnx_model_file,expected_onnx_model,expected_file", [ @@ -322,4 +380,3 @@ def test_when_model_kwargs_then_passes_to_from_pretrained(): assert call_kwargs["custom_param1"] == "value1" assert call_kwargs["custom_param2"] == 42 - diff --git a/presidio-analyzer/tests/test_recognizers_loader_utils.py b/presidio-analyzer/tests/test_recognizers_loader_utils.py index 89a178b182..f2a6da54d7 100644 --- a/presidio-analyzer/tests/test_recognizers_loader_utils.py +++ b/presidio-analyzer/tests/test_recognizers_loader_utils.py @@ -173,6 +173,45 @@ def test_configuration_loader_bad_yaml_raises_value_error(tmp_path): RecognizerConfigurationLoader.get(conf_file=str(f)) +def test_yaml_two_gliner_entries_without_name_yield_distinct_recognizers(monkeypatch): + 
"""Validated registry YAML + RecognizerListLoader; no HF model download/load.""" + from presidio_analyzer.input_validation import ConfigurationValidator + from presidio_analyzer.predefined_recognizers import GLiNERRecognizer + + def _noop(self): + return None + + monkeypatch.setattr(GLiNERRecognizer, "load", _noop) + + cfg = ConfigurationValidator.validate_recognizer_registry_configuration( + { + "supported_languages": ["en"], + "global_regex_flags": 26, + "recognizers": [ + { + "type": "predefined", + "class_name": "GLiNERRecognizer", + "model_name": "team/model-a", + }, + { + "type": "predefined", + "class_name": "GLiNERRecognizer", + "model_name": "team/model-b", + }, + ], + } + ) + instances = list( + RecognizerListLoader.get( + cfg["recognizers"], cfg["supported_languages"], cfg["global_regex_flags"] + ) + ) + names = sorted(r.name for r in instances) + assert len(names) == 2 + assert names[0].startswith("GLiNERRecognizer_team_model_a_") + assert names[1].startswith("GLiNERRecognizer_team_model_b_") + + def test_convert_supported_entities_to_entity_uses_first_item(): """Test that supported_entities list is converted to single supported_entity.""" conf = {"supported_entities": ["ENT1", "ENT2"]} diff --git a/presidio-analyzer/tests/test_yaml_recognizer_models.py b/presidio-analyzer/tests/test_yaml_recognizer_models.py index 1979f3bf9c..7a011afdd4 100644 --- a/presidio-analyzer/tests/test_yaml_recognizer_models.py +++ b/presidio-analyzer/tests/test_yaml_recognizer_models.py @@ -172,6 +172,30 @@ def test_predefined_recognizer_config_with_language(): assert config.supported_languages is None +def test_recognizer_description_uses_class_name_when_name_missing(): + """Test error descriptions fall back to class_name when name is optional.""" + config = PredefinedRecognizerConfig(class_name="CreditCardRecognizer") + + assert config._recognizer_description_for_errors() == "CreditCardRecognizer" + + +def test_recognizer_description_uses_generic_fallback_without_name_or_class(): + """Test error descriptions have a generic 
fallback for incomplete configs.""" + config = BaseRecognizerConfig.model_construct(name=None, class_name=None) + + assert config._recognizer_description_for_errors() == "recognizer" + + +def test_predefined_recognizer_requires_name_or_class_name(): + """Test predefined recognizer requires a name or explicit class_name.""" + with pytest.raises(ValidationError) as exc_info: + PredefinedRecognizerConfig() + + error_message = str(exc_info.value) + assert "Predefined recognizer requires either 'name'" in error_message + assert "'class_name'" in error_message + + def test_custom_recognizer_config_with_patterns(): """Test custom recognizer with patterns.""" patterns = [