Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ def validate_language_configuration(self):

return self

def _recognizer_description_for_errors(self) -> str:
    """Return the label that identifies this recognizer in error messages.

    The first non-empty value among ``name`` and ``class_name`` is used.
    Empty strings are treated like missing values so that a message never
    reads ``"Recognizer  has both ..."`` with a blank label.

    :return: ``name`` if set, otherwise ``class_name`` if set, otherwise the
        generic word ``"recognizer"``.
    """
    for attribute in ("name", "class_name"):
        # getattr with a default keeps the helper usable on configuration
        # objects that do not define one of the two attributes.
        label = getattr(self, attribute, None)
        if label:
            return label
    return "recognizer"

@model_validator(mode="after")
def validate_entity_configuration(self):
"""Ensure proper entity validation."""
Expand All @@ -108,8 +115,9 @@ def validate_entity_configuration(self):
)

if user_provided_both:
describe = self._recognizer_description_for_errors()
raise ValueError(
f"Recognizer {self.name} has both "
f"Recognizer {describe} has both "
"'supported_entity' and 'supported_entities' specified."
)

Expand Down Expand Up @@ -140,11 +148,24 @@ class PredefinedRecognizerConfig(BaseRecognizerConfig):
"""Configuration for predefined recognizers."""

type: str = Field(default="predefined", description="Type of recognizer")
name: Optional[str] = Field(
default=None,
description=(
"Instance name for analysis results; optional when "
"`class_name` is set — some recognizers infer a stable default "
"(e.g. GLiNERRecognizer from model_name)."
),
)

@model_validator(mode="after")
def validate_predefined_recognizer_exists(self):
"""Validate that the predefined recognizer class actually exists."""
recognizer_class_name = self.class_name if self.class_name else self.name
if recognizer_class_name is None:
raise ValueError(
"Predefined recognizer requires either 'name' "
"(shorthand/class key) or 'class_name'."
)
try:
RecognizerListLoader.get_existing_recognizer_cls(recognizer_class_name)
except PredefinedRecognizerNotFoundError as e:
Expand Down Expand Up @@ -176,6 +197,27 @@ class HuggingFaceRecognizerConfig(PredefinedRecognizerConfig):
)


class GLiNERRecognizerConfig(PredefinedRecognizerConfig):
    """Predefined-recognizer configuration extended with GLiNER options.

    Every GLiNER-specific option is optional and defaults to ``None``; keys
    that are not declared here are accepted as well (``extra="allow"``).
    """

    # NOTE(review): a reviewer flagged this definition as a duplicate of one
    # already present in the file (around line 210) - confirm before keeping both.

    model_config = ConfigDict(extra="allow")

    model_name: Optional[str] = Field(default=None, description="GLiNER model name")
    entity_mapping: Optional[Dict[str, str]] = Field(
        default=None,
        description="GLiNER label to Presidio entity mapping",
    )
    flat_ner: Optional[bool] = Field(
        default=None,
        description="Whether to use flat NER",
    )
    multi_label: Optional[bool] = Field(
        default=None,
        description="Whether to use multi-label classification",
    )
    threshold: Optional[float] = Field(
        default=None,
        description="Confidence threshold",
    )
    map_location: Optional[str] = Field(default=None, description="Model device")
    load_onnx_model: Optional[bool] = Field(
        default=None,
        description="Whether to load GLiNER with ONNX Runtime",
    )
    onnx_model_file: Optional[str] = Field(
        default=None,
        description="ONNX model file name",
    )


class CustomRecognizerConfig(BaseRecognizerConfig):
"""Configuration for custom pattern-based recognizers."""

Expand Down Expand Up @@ -280,6 +322,7 @@ class RecognizerRegistryConfig(BaseModel):
recognizers: List[
Union[
HuggingFaceRecognizerConfig,
GLiNERRecognizerConfig,
PredefinedRecognizerConfig,
CustomRecognizerConfig,
str,
Expand Down Expand Up @@ -463,4 +506,5 @@ def validate_language_presence(self):
# This allows for modular expansion without polluting the base config
# Keys are recognizer class names; values are the config models dedicated to them.
# NOTE(review): the code that consults this map is outside this view - presumably
# it selects the specialized model for a matching entry; confirm against the caller.
CONFIG_MODEL_MAP: Dict[str, Type[BaseModel]] = {
"HuggingFaceNerRecognizer": HuggingFaceRecognizerConfig,
"GLiNERRecognizer": GLiNERRecognizerConfig,
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import re
from typing import Dict, List, Optional

from presidio_analyzer import (
Expand All @@ -22,14 +23,37 @@

# Module logger, registered under the "presidio-analyzer" name.
logger = logging.getLogger("presidio-analyzer")

# Model id whose recognizers keep the legacy name when no explicit name is given.
_DEFAULT_GLINER_MODEL_NAME = "urchade/gliner_multi_pii-v1"
# Historical recognizer name; also the prefix of names derived from other model ids.
_LEGACY_GLINER_RECOGNIZER_NAME = "GLiNERRecognizer"


def _sanitize_model_name_for_instance_name(model_name: str) -> str:
    """Turn a model id into an identifier-friendly suffix.

    Every run of characters outside ``[0-9A-Za-z]`` becomes a single
    underscore, and underscores at either end are dropped, e.g.
    ``"team/model-a"`` -> ``"team_model_a"``.

    Distinct ids that differ only in punctuation map to the same suffix
    (``"a/b"`` and ``"a-b"`` both give ``"a_b"``).

    :param model_name: Model id, e.g. a HF-style ``"org/model"`` string.
    :return: The sanitized suffix; empty when ``model_name`` contains no
        ASCII letters or digits.
    """
    # "_" is itself outside the allowed class, so this one substitution already
    # collapses mixed runs such as "__-/" into a single underscore; a separate
    # pass squeezing repeated underscores would never find anything to do.
    return re.sub(r"[^0-9A-Za-z]+", "_", model_name).strip("_")


def _default_gliner_recognizer_name(model_name: str) -> str:
    """Build the recognizer ``name`` used when the caller did not supply one.

    The built-in default model maps to the legacy recognizer name, so existing
    setups keep the name they had before; any other model id gets the legacy
    name followed by ``_`` and its sanitized form.

    :param model_name: Model id the recognizer is configured with.
    :return: The recognizer name derived from ``model_name``.
    """
    uses_builtin_model = model_name == _DEFAULT_GLINER_MODEL_NAME
    if not uses_builtin_model:
        model_token = _sanitize_model_name_for_instance_name(model_name)
        return "_".join((_LEGACY_GLINER_RECOGNIZER_NAME, model_token))
    return _LEGACY_GLINER_RECOGNIZER_NAME
Comment on lines +31 to +51


class GLiNERRecognizer(LocalRecognizer):
"""GLiNER model based entity recognizer."""

def __init__(
self,
supported_entities: Optional[List[str]] = None,
name: str = "GLiNERRecognizer",
name: Optional[str] = None,
supported_language: str = "en",
version: str = "0.0.1",
context: Optional[List[str]] = None,
Expand All @@ -51,7 +75,9 @@ def __init__(
:param supported_entities: List of supported entities for this recognizer.
If None, all entities in Presidio's default configuration will be used.
see `NerModelConfiguration`
:param name: Name of the recognizer
:param name: Name of the recognizer. When omitted, a deterministic name is
derived from ``model_name`` (with the default model preserving the legacy
name ``GLiNERRecognizer``).
:param supported_language: Language code to use for the recognizer
:param version: Version of the recognizer
:param context: N/A for this recognizer
Expand Down Expand Up @@ -131,9 +157,13 @@ def __init__(

self.gliner = None

resolved_name = name if name is not None else _default_gliner_recognizer_name(
model_name
)

super().__init__(
supported_entities=supported_entities,
name=name,
name=resolved_name,
supported_language=supported_language,
version=version,
context=context,
Expand Down
36 changes: 34 additions & 2 deletions presidio-analyzer/tests/test_gliner_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
import sys
from unittest.mock import MagicMock, patch

import pytest
from unittest.mock import MagicMock, patch

from presidio_analyzer.predefined_recognizers import GLiNERRecognizer
from presidio_analyzer.chunkers import CharacterBasedTextChunker
from presidio_analyzer.predefined_recognizers import GLiNERRecognizer


@pytest.fixture
def noop_gliner_load(monkeypatch):
    """Replace ``GLiNERRecognizer.load`` with a no-op for the requesting test.

    Lets tests construct the recognizer without the GLiNER package or weights.
    """
    monkeypatch.setattr(GLiNERRecognizer, "load", lambda self: None)


@pytest.fixture
Expand Down Expand Up @@ -261,6 +271,28 @@ def mock_predict_entities(text, labels, flat_ner, threshold, multi_label):
)


def test_when_name_omitted_derives_distinct_names_from_model(noop_gliner_load):
    """Recognizers built without ``name`` get a name derived from ``model_name``.

    ``load`` is stubbed by the fixture, so no GLiNER package or weights are needed.
    """
    expected_by_model = {
        "model/a": "GLiNERRecognizer_model_a",
        "model/b": "GLiNERRecognizer_model_b",
    }
    derived = [
        GLiNERRecognizer(model_name=model_id).name for model_id in expected_by_model
    ]
    assert derived == list(expected_by_model.values())
    # Two different models must never end up sharing a recognizer name.
    assert len(set(derived)) == len(derived)


def test_when_explicit_name_then_preserved(noop_gliner_load):
    """An explicitly supplied ``name`` wins over the model-derived default."""
    recognizer = GLiNERRecognizer(model_name="model/a", name="custom")
    assert recognizer.name == "custom"


def test_when_default_model_and_name_omitted_keeps_legacy_name(noop_gliner_load):
    """With no arguments at all, the recognizer keeps its historical name."""
    legacy_name = "GLiNERRecognizer"
    assert GLiNERRecognizer().name == legacy_name


@pytest.mark.parametrize(
"load_onnx_model,onnx_model_file,expected_onnx_model,expected_file",
[
Expand Down
39 changes: 39 additions & 0 deletions presidio-analyzer/tests/test_recognizers_loader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,45 @@ def test_configuration_loader_bad_yaml_raises_value_error(tmp_path):
RecognizerConfigurationLoader.get(conf_file=str(f))


def test_yaml_two_gliner_entries_without_name_yield_distinct_recognizers(monkeypatch):
    """Two nameless GLiNER entries must load as two distinctly named recognizers.

    The configuration goes through ``ConfigurationValidator`` and then
    ``RecognizerListLoader``; ``GLiNERRecognizer.load`` is stubbed out first so
    no model is downloaded or loaded.
    """
    from presidio_analyzer.input_validation import ConfigurationValidator
    from presidio_analyzer.predefined_recognizers import GLiNERRecognizer

    monkeypatch.setattr(GLiNERRecognizer, "load", lambda self: None)

    model_ids = ("team/model-a", "team/model-b")
    registry_conf = {
        "supported_languages": ["en"],
        "global_regex_flags": 26,
        "recognizers": [
            {
                "type": "predefined",
                "class_name": "GLiNERRecognizer",
                "model_name": model_id,
            }
            for model_id in model_ids
        ],
    }
    validated = ConfigurationValidator.validate_recognizer_registry_configuration(
        registry_conf
    )

    loaded = RecognizerListLoader.get(
        validated["recognizers"],
        validated["supported_languages"],
        validated["global_regex_flags"],
    )
    assert sorted(recognizer.name for recognizer in loaded) == [
        "GLiNERRecognizer_team_model_a",
        "GLiNERRecognizer_team_model_b",
    ]


def test_convert_supported_entities_to_entity_uses_first_item():
"""Test that supported_entities list is converted to single supported_entity."""
conf = {"supported_entities": ["ENT1", "ENT2"]}
Expand Down
24 changes: 24 additions & 0 deletions presidio-analyzer/tests/test_yaml_recognizer_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,30 @@ def test_predefined_recognizer_config_with_language():
assert config.supported_languages is None


def test_recognizer_description_uses_class_name_when_name_missing():
    """Without a ``name``, the error description is the configured ``class_name``."""
    class_name = "CreditCardRecognizer"
    recognizer_conf = PredefinedRecognizerConfig(class_name=class_name)

    description = recognizer_conf._recognizer_description_for_errors()

    assert description == class_name


def test_recognizer_description_uses_generic_fallback_without_name_or_class():
    """With neither ``name`` nor ``class_name``, a generic word is returned."""
    # model_construct is used so that both fields can be left as None.
    bare_conf = BaseRecognizerConfig.model_construct(class_name=None, name=None)

    description = bare_conf._recognizer_description_for_errors()

    assert description == "recognizer"


def test_predefined_recognizer_requires_name_or_class_name():
    """Constructing with neither ``name`` nor ``class_name`` fails validation."""
    with pytest.raises(ValidationError) as exc_info:
        PredefinedRecognizerConfig()

    rendered = str(exc_info.value)
    for fragment in ("Predefined recognizer requires either 'name'", "'class_name'"):
        assert fragment in rendered


def test_custom_recognizer_config_with_patterns():
"""Test custom recognizer with patterns."""
patterns = [
Expand Down