Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ def validate_language_configuration(self):

return self

def _recognizer_description_for_errors(self) -> str:
    """Return the label that identifies this recognizer in error messages.

    The recognizer's ``name`` is preferred; when it is ``None`` the
    ``class_name`` is used instead, and when both are ``None`` the generic
    word ``"recognizer"`` is returned.
    """
    label = self.name
    if label is None:
        # Only consult class_name when no explicit name is available.
        label = self.class_name
    return "recognizer" if label is None else label

@model_validator(mode="after")
def validate_entity_configuration(self):
"""Ensure proper entity validation."""
Expand All @@ -108,8 +115,9 @@ def validate_entity_configuration(self):
)

if user_provided_both:
describe = self._recognizer_description_for_errors()
raise ValueError(
f"Recognizer {self.name} has both "
f"Recognizer {describe} has both "
"'supported_entity' and 'supported_entities' specified."
)

Expand Down Expand Up @@ -140,11 +148,24 @@ class PredefinedRecognizerConfig(BaseRecognizerConfig):
"""Configuration for predefined recognizers."""

type: str = Field(default="predefined", description="Type of recognizer")
name: Optional[str] = Field(
default=None,
description=(
"Instance name for analysis results; optional when "
"`class_name` is set — some recognizers infer a stable default "
"(e.g. GLiNERRecognizer from model_name)."
),
)

@model_validator(mode="after")
def validate_predefined_recognizer_exists(self):
"""Validate that the predefined recognizer class actually exists."""
recognizer_class_name = self.class_name if self.class_name else self.name
if recognizer_class_name is None:
raise ValueError(
"Predefined recognizer requires either 'name' "
"(shorthand/class key) or 'class_name'."
)
try:
RecognizerListLoader.get_existing_recognizer_cls(recognizer_class_name)
except PredefinedRecognizerNotFoundError as e:
Expand Down Expand Up @@ -350,6 +371,7 @@ class RecognizerRegistryConfig(BaseModel):
recognizers: List[
Union[
HuggingFaceRecognizerConfig,
GLiNERRecognizerConfig,
PredefinedRecognizerConfig,
CustomRecognizerConfig,
str,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import hashlib
import json
import logging
import re
from typing import Dict, List, Optional

from presidio_analyzer import (
Expand All @@ -22,19 +24,45 @@

logger = logging.getLogger("presidio-analyzer")

# Model id used as the default for the recognizer's ``model_name`` parameter.
_DEFAULT_GLINER_MODEL_NAME = "urchade/gliner_multi_pii-v1"
# Recognizer name kept for the default model; also the prefix of derived names.
_LEGACY_GLINER_RECOGNIZER_NAME = "GLiNERRecognizer"


def _sanitize_model_name_for_instance_name(model_name: str) -> str:
    """Map a HF-style model id to a deterministic, collision-resistant suffix.

    The suffix has two parts joined by ``_``:

    * the ASCII alphanumeric runs of ``model_name`` joined by single
      underscores, or the word ``model`` when the id contains none;
    * the first 8 hex digits of the SHA-256 digest of the original id, which
      keeps ids that sanitize to the same text distinguishable.

    :param model_name: Model identifier, e.g. ``"team/model-a"``.
    :return: Suffix such as ``"team_model_a_<8 hex digits>"``.
    """
    # Collecting the alphanumeric runs directly yields no leading, trailing
    # or repeated underscores, so no follow-up clean-up pass is needed.
    alnum_runs = re.findall(r"[0-9A-Za-z]+", model_name)
    readable = "_".join(alnum_runs) or "model"
    # Hash the untouched id so punctuation-only differences still matter.
    digest = hashlib.sha256(model_name.encode("utf-8")).hexdigest()
    return f"{readable}_{digest[:8]}"


def _default_gliner_recognizer_name(model_name: str) -> str:
    """Return the recognizer ``name`` to use when the caller did not give one.

    For any model other than the built-in default, the name is the legacy
    recognizer name followed by ``_`` and a suffix derived from
    ``model_name``. The built-in default model keeps the bare legacy name
    for backwards compatibility.
    """
    if model_name != _DEFAULT_GLINER_MODEL_NAME:
        suffix = _sanitize_model_name_for_instance_name(model_name)
        return f"{_LEGACY_GLINER_RECOGNIZER_NAME}_{suffix}"
    # Default model: keep the historical name unchanged.
    return _LEGACY_GLINER_RECOGNIZER_NAME
Comment on lines +31 to +51


class GLiNERRecognizer(LocalRecognizer):
"""GLiNER model based entity recognizer."""

def __init__(
self,
supported_entities: Optional[List[str]] = None,
name: str = "GLiNERRecognizer",
name: Optional[str] = None,
supported_language: str = "en",
version: str = "0.0.1",
context: Optional[List[str]] = None,
entity_mapping: Optional[Dict[str, str]] = None,
model_name: str = "urchade/gliner_multi_pii-v1",
model_name: str = _DEFAULT_GLINER_MODEL_NAME,
flat_ner: bool = True,
multi_label: bool = False,
threshold: float = 0.30,
Expand All @@ -51,7 +79,9 @@ def __init__(
:param supported_entities: List of supported entities for this recognizer.
If None, all entities in Presidio's default configuration will be used.
see `NerModelConfiguration`
:param name: Name of the recognizer
:param name: Name of the recognizer. When omitted, a deterministic name is
derived from ``model_name`` (with the default model preserving the legacy
name ``GLiNERRecognizer``).
:param supported_language: Language code to use for the recognizer
:param version: Version of the recognizer
:param context: N/A for this recognizer
Expand Down Expand Up @@ -131,9 +161,13 @@ def __init__(

self.gliner = None

resolved_name = name if name is not None else _default_gliner_recognizer_name(
model_name
)

super().__init__(
supported_entities=supported_entities,
name=name,
name=resolved_name,
supported_language=supported_language,
version=version,
context=context,
Expand Down
63 changes: 60 additions & 3 deletions presidio-analyzer/tests/test_gliner_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
import inspect
import sys
from unittest.mock import MagicMock, patch

import pytest
from unittest.mock import MagicMock, patch

from presidio_analyzer.predefined_recognizers import GLiNERRecognizer
from presidio_analyzer.chunkers import CharacterBasedTextChunker
from presidio_analyzer.predefined_recognizers import GLiNERRecognizer
from presidio_analyzer.predefined_recognizers.ner import gliner_recognizer


@pytest.fixture
def noop_gliner_load(monkeypatch):
    """Stub ``GLiNERRecognizer.load`` with a no-op for the requesting test.

    This avoids needing the GLiNER package or model weights when a test only
    constructs the recognizer.
    """
    monkeypatch.setattr(GLiNERRecognizer, "load", lambda self: None)


@pytest.fixture
Expand Down Expand Up @@ -261,6 +273,52 @@ def mock_predict_entities(text, labels, flat_ner, threshold, multi_label):
)


def test_when_name_omitted_derives_distinct_names_from_model(noop_gliner_load):
    """Recognizers created without ``name`` get distinct, model-based names.

    Loading is stubbed by the fixture, so no GLiNER package or weights are
    required.
    """
    first = GLiNERRecognizer(model_name="model/a")
    second = GLiNERRecognizer(model_name="model/b")

    expected_prefixes = (
        (first, "GLiNERRecognizer_model_a_"),
        (second, "GLiNERRecognizer_model_b_"),
    )
    for recognizer, prefix in expected_prefixes:
        assert recognizer.name.startswith(prefix)
    assert first.name != second.name


def test_when_sanitized_model_names_collide_then_hash_keeps_names_distinct(
    noop_gliner_load,
):
    """Ids that differ only in punctuation must still yield unique names."""
    shared_prefix = "GLiNERRecognizer_team_model_a_"

    slash_dash = GLiNERRecognizer(model_name="team/model-a")
    underscores = GLiNERRecognizer(model_name="team_model_a")
    # An id with no alphanumerics falls back to the generic "model" stem.
    no_alnum = GLiNERRecognizer(model_name="---")

    assert slash_dash.name.startswith(shared_prefix)
    assert underscores.name.startswith(shared_prefix)
    assert slash_dash.name != underscores.name
    assert no_alnum.name.startswith("GLiNERRecognizer_model_")


def test_when_explicit_name_then_preserved(noop_gliner_load):
    """A ``name`` passed by the caller is used as-is, not derived."""
    recognizer = GLiNERRecognizer(name="custom", model_name="model/a")

    assert recognizer.name == "custom"


def test_when_default_model_and_name_omitted_keeps_legacy_name(noop_gliner_load):
    """With all defaults, the recognizer keeps its historical name."""
    recognizer = GLiNERRecognizer()

    assert recognizer.name == "GLiNERRecognizer"


def test_default_model_parameter_reuses_shared_constant():
    """Guard against the constructor default drifting from the shared constant.

    The legacy-name decision compares against the module constant, so the
    ``model_name`` default of ``__init__`` must be that same value.
    """
    init_params = inspect.signature(GLiNERRecognizer.__init__).parameters
    model_name_default = init_params["model_name"].default

    assert model_name_default == gliner_recognizer._DEFAULT_GLINER_MODEL_NAME


@pytest.mark.parametrize(
"load_onnx_model,onnx_model_file,expected_onnx_model,expected_file",
[
Expand Down Expand Up @@ -322,4 +380,3 @@ def test_when_model_kwargs_then_passes_to_from_pretrained():
assert call_kwargs["custom_param1"] == "value1"
assert call_kwargs["custom_param2"] == 42


39 changes: 39 additions & 0 deletions presidio-analyzer/tests/test_recognizers_loader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,45 @@ def test_configuration_loader_bad_yaml_raises_value_error(tmp_path):
RecognizerConfigurationLoader.get(conf_file=str(f))


def test_yaml_two_gliner_entries_without_name_yield_distinct_recognizers(monkeypatch):
    """Two unnamed GLiNER registry entries load as distinctly named recognizers.

    The registry configuration is validated and then handed to
    ``RecognizerListLoader``; ``GLiNERRecognizer.load`` is stubbed so no
    model is downloaded or loaded.
    """
    from presidio_analyzer.input_validation import ConfigurationValidator
    from presidio_analyzer.predefined_recognizers import GLiNERRecognizer

    def _noop(self):
        return None

    monkeypatch.setattr(GLiNERRecognizer, "load", _noop)

    cfg = ConfigurationValidator.validate_recognizer_registry_configuration(
        {
            "supported_languages": ["en"],
            "global_regex_flags": 26,
            "recognizers": [
                {
                    "type": "predefined",
                    "class_name": "GLiNERRecognizer",
                    "model_name": "team/model-a",
                },
                {
                    "type": "predefined",
                    "class_name": "GLiNERRecognizer",
                    "model_name": "team/model-b",
                },
            ],
        }
    )
    instances = list(
        RecognizerListLoader.get(
            cfg["recognizers"], cfg["supported_languages"], cfg["global_regex_flags"]
        )
    )
    names = sorted(r.name for r in instances)

    # Names derived from ``model_name`` end in a hash suffix, so match on the
    # readable prefix instead of comparing the full string.
    assert len(names) == 2
    assert names[0].startswith("GLiNERRecognizer_team_model_a_")
    assert names[1].startswith("GLiNERRecognizer_team_model_b_")


def test_convert_supported_entities_to_entity_uses_first_item():
"""Test that supported_entities list is converted to single supported_entity."""
conf = {"supported_entities": ["ENT1", "ENT2"]}
Expand Down
24 changes: 24 additions & 0 deletions presidio-analyzer/tests/test_yaml_recognizer_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,30 @@ def test_predefined_recognizer_config_with_language():
assert config.supported_languages is None


def test_recognizer_description_uses_class_name_when_name_missing():
    """Without a ``name``, the error description falls back to ``class_name``."""
    unnamed = PredefinedRecognizerConfig(class_name="CreditCardRecognizer")
    description = unnamed._recognizer_description_for_errors()

    assert description == "CreditCardRecognizer"


def test_recognizer_description_uses_generic_fallback_without_name_or_class():
    """With neither ``name`` nor ``class_name``, a generic label is returned."""
    # model_construct builds the object without running validation.
    incomplete = BaseRecognizerConfig.model_construct(name=None, class_name=None)
    description = incomplete._recognizer_description_for_errors()

    assert description == "recognizer"


def test_predefined_recognizer_requires_name_or_class_name():
    """A predefined config with neither ``name`` nor ``class_name`` is rejected."""
    with pytest.raises(ValidationError) as raised:
        PredefinedRecognizerConfig()

    rendered = str(raised.value)
    expected_fragments = (
        "Predefined recognizer requires either 'name'",
        "'class_name'",
    )
    for fragment in expected_fragments:
        assert fragment in rendered


def test_custom_recognizer_config_with_patterns():
"""Test custom recognizer with patterns."""
patterns = [
Expand Down