Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
9888943
feat: add TokenizerBasedTextChunker and update TextChunkerProvider fo…
yuriihavrylko May 25, 2026
c16d31c
feat: enhance GLiNER and HuggingFaceNer recognizers with customizable…
yuriihavrylko May 25, 2026
6cab1fc
docs: add text chunking customization for GLiNERRecognizer using toke…
yuriihavrylko May 25, 2026
e4a5665
feat: reserve special tokens in max_tokens calculation for TokenizerB…
yuriihavrylko May 31, 2026
bb0f6b0
Merge branch 'main' into feat/tokenizer-based-text-chunker
omri374 Jun 1, 2026
9894929
style: format code in TokenizerBasedTextChunker
yuriihavrylko Jun 1, 2026
17c0635
refactor: remove lazy import for TokenizerBasedTextChunker in __init_…
yuriihavrylko Jun 1, 2026
9035af5
feat: enforce fast tokenizer requirement in TokenizerBasedTextChunker…
yuriihavrylko Jun 1, 2026
47d3a01
test: add YAML configuration tests for character and tokenizer chunke…
yuriihavrylko Jun 1, 2026
266dfe7
test: add tests for unknown chunker type and invalid text chunker dic…
yuriihavrylko Jun 1, 2026
2bf870b
feat: enhance tokenizer configuration and clamp overlap tokens in Tok…
yuriihavrylko Jun 1, 2026
4b4b376
test: add tests for slow tokenizer and overlap clamping in TokenizerB…
yuriihavrylko Jun 1, 2026
c5520f8
feat: add pydantic TextChunkerConfig model and integrate with GLiNER …
yuriihavrylko Jun 4, 2026
1358b7d
feat: make tokenizer optional in TokenizerBasedTextChunker with defer…
yuriihavrylko Jun 4, 2026
06b2e69
feat: handle missing offset_mapping in TokenizerBasedTextChunker
yuriihavrylko Jun 4, 2026
3f32f68
docs: update GLiNERRecognizer configuration to use model_name and imp…
yuriihavrylko Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/analyzer/recognizer_registry_provider.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,15 @@ The recognizer list comprises of both the predefined and custom recognizers, for
- `supported_entity`: the detected entity associated by the recognizer.
- `deny_list`: A list of words to detect, in case the recognizer uses a predefined list of words.
- `deny_list_score`: confidence score for a term identified using a deny-list.
- `text_chunker`: configures how long texts are split for NER recognizers (`GLiNERRecognizer`, `HuggingFaceNerRecognizer`). Accepts a dict with `chunker_type` and params. Available types: `character` (default) and `tokenizer` (uses the model's tokenizer for accurate token-based splitting). Example:

```yaml
Comment thread
omri374 marked this conversation as resolved.
- name: GLiNERRecognizer
  type: predefined
  text_chunker:
    chunker_type: tokenizer
    tokenizer: urchade/gliner_multi_pii-v1
```

!!! tip "Configuration Tip: Agglutinative languages (e.g., Korean)"

Expand Down
30 changes: 30 additions & 0 deletions docs/samples/python/gliner.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,36 @@ results = analyzer_engine.analyze(
print(results)
```

## Text Chunking

By default, GLiNERRecognizer splits long texts into character-based chunks (250 chars, 50 overlap). You can customize this via `text_chunker`:

**From Python:**

```python
from presidio_analyzer.chunkers import CharacterBasedTextChunker

gliner_recognizer = GLiNERRecognizer(
    model_name="urchade/gliner_multi_pii-v1",
    entity_mapping=entity_mapping,
    text_chunker=CharacterBasedTextChunker(chunk_size=400, chunk_overlap=60),
)
```

**From YAML (using tokenizer-based chunking):**
Comment thread
omri374 marked this conversation as resolved.

```yaml
- name: GLiNERRecognizer
  type: predefined
  text_chunker:
    chunker_type: tokenizer
    tokenizer: urchade/gliner_multi_pii-v1
    max_tokens: 512
    overlap_tokens: 32
```

The `tokenizer` chunker uses the model's actual tokenizer to split text by token count, respecting the model's token limit instead of approximating with character counts.

## ONNX Runtime Support

GLiNERRecognizer supports using ONNX Runtime as a backend, which provides better CPU compatibility and can prevent crashes on older CPUs without AVX2 instruction set support (e.g., Intel Sandy Bridge).
Expand Down
12 changes: 12 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,17 @@
"TextChunk",
"CharacterBasedTextChunker",
"TextChunkerProvider",
"TokenizerBasedTextChunker",
]


def __getattr__(name: str):
    """Resolve ``TokenizerBasedTextChunker`` lazily on first attribute access.

    Module-level ``__getattr__`` hook (PEP 562): it is only consulted when
    ``name`` is not already present in the module namespace.

    :param name: The attribute being looked up on this module.
    :return: The ``TokenizerBasedTextChunker`` class.
    :raises AttributeError: If ``name`` is any other attribute.
    """
    if name != "TokenizerBasedTextChunker":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
        TokenizerBasedTextChunker,
    )

    # Cache the class in the module namespace so later lookups are served
    # directly and never re-enter this hook.
    globals()[name] = TokenizerBasedTextChunker
    return TokenizerBasedTextChunker

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Factory provider for creating text chunkers from configuration."""

import logging
from typing import Any, Dict, Optional, Type
from typing import Any, Dict, Optional

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
from presidio_analyzer.chunkers.character_based_text_chunker import (
Expand All @@ -10,11 +10,6 @@

logger = logging.getLogger("presidio-analyzer")

# Registry mapping chunker type names to classes
_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
"character": CharacterBasedTextChunker,
}


class TextChunkerProvider:
"""Create text chunkers from configuration.
Expand Down Expand Up @@ -44,17 +39,23 @@ def create_chunker(self) -> BaseTextChunker:
config = self.chunker_configuration.copy()
chunker_type = config.pop("chunker_type", "character")

if chunker_type not in _CHUNKER_REGISTRY:
if chunker_type == "character":
chunker_class = CharacterBasedTextChunker
elif chunker_type == "tokenizer":
Comment thread
omri374 marked this conversation as resolved.
from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
TokenizerBasedTextChunker,
)

chunker_class = TokenizerBasedTextChunker
Comment thread
yuriihavrylko marked this conversation as resolved.
else:
raise ValueError(
f"Unknown chunker_type '{chunker_type}'. "
f"Available: {list(_CHUNKER_REGISTRY.keys())}"
f"Available: ['character', 'tokenizer']"
)
Comment thread
omri374 marked this conversation as resolved.
Comment thread
omri374 marked this conversation as resolved.
Comment thread
yuriihavrylko marked this conversation as resolved.

chunker_class = _CHUNKER_REGISTRY[chunker_type]
try:
return chunker_class(**config)
except TypeError as exc:
raise ValueError(
f"Invalid configuration for chunker_type '{chunker_type}': {config}"
) from exc

Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Tokenizer-based text chunker using HuggingFace tokenizers."""
import logging
from typing import TYPE_CHECKING, List, Union

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk

if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase

logger = logging.getLogger("presidio-analyzer")

# Fallback when the tokenizer does not expose a finite model_max_length
_DEFAULT_MAX_TOKENS = 512


class TokenizerBasedTextChunker(BaseTextChunker):
    """Text chunker that splits text based on tokenizer token counts.

    Chunks are defined by token boundaries and mapped back to character
    offsets in the original text, so a chunk never starts or ends in the
    middle of a token and its size is measured in the same unit as the
    model's input limit.

    Can be configured from YAML by passing a tokenizer name string in the
    recognizer's ``text_chunker`` entry:

        text_chunker:
          chunker_type: tokenizer
          tokenizer: bert-base-uncased
          max_tokens: 512
          overlap_tokens: 32

    :param tokenizer: A HuggingFace tokenizer name (str) or a loaded
        tokenizer instance. It must be a fast tokenizer, because character
        offsets (``return_offsets_mapping``) are required.
    :param max_tokens: Maximum number of tokens per chunk. When omitted it
        is derived from the tokenizer's model_max_length (falling back to
        512 if not set or unreasonably large), minus the special tokens
        the tokenizer adds to a single sequence.
    :param overlap_tokens: Number of tokens to overlap between consecutive
        chunks (must be >= 0 and < max_tokens). Defaults to 32.
    :raises ImportError: If a tokenizer name is given but transformers is
        not installed.
    :raises ValueError: If the tokenizer is not a fast tokenizer, or if
        max_tokens / overlap_tokens are out of range.
    """

    def __init__(
        self,
        tokenizer: Union[str, "PreTrainedTokenizerBase"],
        max_tokens: Union[int, None] = None,
        overlap_tokens: int = 32,
    ):
        if isinstance(tokenizer, str):
            try:
                from transformers import AutoTokenizer
            except ImportError as e:
                raise ImportError(
                    "transformers is required to load a tokenizer by name. "
                    "Install it with: pip install transformers"
                ) from e
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

        # chunk() depends on offset mappings, which slow (pure-Python)
        # tokenizers cannot produce. Fail at construction time with a clear
        # message instead of on the first chunk() call. Objects that do not
        # expose ``is_fast`` at all are given the benefit of the doubt.
        if getattr(tokenizer, "is_fast", True) is False:
            raise ValueError(
                "TokenizerBasedTextChunker requires a fast tokenizer "
                "(one that supports return_offsets_mapping)"
            )

        self.tokenizer = tokenizer

        if max_tokens is None:
            max_tokens = self._default_max_tokens(tokenizer)

        if max_tokens <= 0:
            raise ValueError("max_tokens must be greater than 0")
        if overlap_tokens < 0 or overlap_tokens >= max_tokens:
            raise ValueError(
                "overlap_tokens must be non-negative and less than max_tokens"
            )

        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens

    @staticmethod
    def _default_max_tokens(tokenizer) -> int:
        """Derive a per-chunk token budget from the tokenizer's own limit.

        :param tokenizer: The loaded tokenizer instance.
        :return: model_max_length (or the 512 fallback) minus the number of
            special tokens the tokenizer adds to a single sequence.
        """
        raw = getattr(tokenizer, "model_max_length", _DEFAULT_MAX_TOKENS)
        # Some tokenizers report absurdly large values (e.g. 1e30)
        if raw is None or raw > 1_000_000:
            limit = _DEFAULT_MAX_TOKENS
        else:
            limit = raw

        # Chunks are measured with add_special_tokens=False, but the model
        # adds its special tokens again at inference time. Reserve room for
        # them so a full chunk still fits within the model's limit.
        count_special = getattr(tokenizer, "num_special_tokens_to_add", None)
        reserved = count_special(pair=False) if callable(count_special) else 0
        if not isinstance(reserved, int) or reserved < 0:
            reserved = 0

        # Never let the reservation consume the whole budget.
        if reserved < limit:
            return limit - reserved
        return limit

    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into token-aligned chunks with character offset tracking.

        :param text: The input text to chunk.
        :return: List of TextChunk objects with text and position information.
            Empty input yields an empty list; text that fits in max_tokens is
            returned as a single chunk covering the whole string.
        """
        if not text:
            return []

        encoding = self.tokenizer(
            text,
            return_offsets_mapping=True,
            add_special_tokens=False,
            truncation=False,
        )

        offsets = encoding["offset_mapping"]
        num_tokens = len(offsets)

        logger.debug(
            "Chunking text: length=%d chars, %d tokens, max_tokens=%d, overlap=%d",
            len(text),
            num_tokens,
            self.max_tokens,
            self.overlap_tokens,
        )

        if num_tokens <= self.max_tokens:
            return [TextChunk(text=text, start=0, end=len(text))]

        chunks = []
        # overlap_tokens < max_tokens is enforced in __init__, so step >= 1.
        step = self.max_tokens - self.overlap_tokens

        for start_token in range(0, num_tokens, step):
            end_token = min(start_token + self.max_tokens, num_tokens)

            # Map the token window back to a character span in the input.
            char_start = offsets[start_token][0]
            char_end = offsets[end_token - 1][1]

            chunks.append(
                TextChunk(
                    text=text[char_start:char_end],
                    start=char_start,
                    end=char_end,
                )
            )

            # The window reached the last token; a further window would only
            # repeat text that is already covered by this chunk.
            if end_token >= num_tokens:
                break

        logger.debug("Created %d chunks from text", len(chunks))
        return chunks
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import json
import logging
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional, Union

from presidio_analyzer import (
AnalysisExplanation,
LocalRecognizer,
RecognizerResult,
)
from presidio_analyzer.chunkers import BaseTextChunker
from presidio_analyzer.chunkers import BaseTextChunker, TextChunkerProvider
from presidio_analyzer.nlp_engine import (
NerModelConfiguration,
NlpArtifacts,
Expand Down Expand Up @@ -39,7 +39,9 @@ def __init__(
multi_label: bool = False,
threshold: float = 0.30,
map_location: Optional[str] = None,
text_chunker: Optional[BaseTextChunker] = None,
text_chunker: Optional[Union[BaseTextChunker, Dict[str, Any]]] = None,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In which scenario would the user pass a dict here? I think it's better to ask the user to pass the chunker class, and instantiate it using the dict prior to calling the recognizer.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The dict path is used when loading from yaml config. The recognizer registry passes yaml fields as kwargs directly to the constructor, so text_chunker arrives as a raw dict like {"chunker_type": "tokenizer", "tokenizer": "model-name", "max_tokens": 512}.

Without dict support here, users would need custom python code to instantiate the chunker before passing it - which defeats the purpose of yaml-based configuration.

The alternative would be handling the dict-to-object conversion in the registry loader, but that would require the loader to know about chunker-specific logic. Keeping it in the recognizer felt more self-contained

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a pydantic validation layer between YAML and actual Presidio classes to handle configuration errors more gracefully. I don't see a reason not to use it here too, and avoid generic dicts as input. Please take a look and see if there's a reason for it not to apply here too. Thanks!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Moved the dict-to-object conversion into the Pydantic validation layer (TextChunkerConfig in yaml_recognizer_models.py) and the registry loader. Recognizer constructors now only accept Optional[BaseTextChunker], no more dicts

chunk_size: int = 250,
chunk_overlap: int = 50,
load_onnx_model: bool = False,
onnx_model_file: str = "model.onnx",
**model_kwargs,
Expand All @@ -63,9 +65,15 @@ def __init__(
(see GLiNER's documentation)
:param map_location: The device to use for the model.
If None, will auto-detect GPU or use CPU.
:param text_chunker: Custom text chunking strategy. If None, uses
CharacterBasedTextChunker with default settings (chunk_size=250,
chunk_overlap=50)
:param text_chunker: Text chunking strategy. Accepts a BaseTextChunker
instance (Python) or a dict config (YAML). If None, uses
CharacterBasedTextChunker with provided chunk_size and chunk_overlap.
Dict example::

{"chunker_type": "tokenizer", "tokenizer": "bert-base-uncased"}

:param chunk_size: Maximum number of characters per chunk.
:param chunk_overlap: Number of characters to overlap between chunks.
:param load_onnx_model: Whether to load the model using ONNX Runtime.
If True, uses ONNX Runtime backend which supports CPUs without AVX2.
Requires onnxruntime to be installed. Default is False.
Expand Down Expand Up @@ -118,15 +126,17 @@ def __init__(
self.onnx_model_file = onnx_model_file
self.model_kwargs = model_kwargs

# Use provided chunker or default to in-house character-based chunker
if text_chunker is not None:
# Initialize text chunker (object, dict config, or default)
if isinstance(text_chunker, dict):
self.text_chunker = TextChunkerProvider(text_chunker).create_chunker()
elif text_chunker is not None:
self.text_chunker = text_chunker
else:
from presidio_analyzer.chunkers import CharacterBasedTextChunker

self.text_chunker = CharacterBasedTextChunker(
chunk_size=250,
chunk_overlap=50,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

self.gliner = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@
LocalRecognizer,
RecognizerResult,
)
from presidio_analyzer.chunkers import BaseTextChunker, CharacterBasedTextChunker
from presidio_analyzer.chunkers import (
BaseTextChunker,
CharacterBasedTextChunker,
TextChunkerProvider,
)
from presidio_analyzer.nlp_engine import NlpArtifacts, device_detector

try:
Expand Down Expand Up @@ -115,7 +119,7 @@ def __init__(
chunk_size: int = 400,
device: Optional[Union[str, int]] = None,
tokenizer_name: Optional[str] = None,
text_chunker: Optional[BaseTextChunker] = None,
text_chunker: Optional[Union[BaseTextChunker, Dict[str, Any]]] = None,
Comment thread
yuriihavrylko marked this conversation as resolved.
Outdated
label_prefixes: Optional[List[str]] = None,
**kwargs,
):
Expand Down Expand Up @@ -144,8 +148,13 @@ def __init__(
:param chunk_overlap: Number of characters to overlap between chunks.
:param chunk_size: Maximum number of characters per chunk.
:param tokenizer_name: Name of the tokenizer. Defaults to model_name.
:param text_chunker: Custom text chunking strategy. If None, uses
:param text_chunker: Text chunking strategy. Accepts a BaseTextChunker
instance (Python) or a dict config (YAML). If None, uses
CharacterBasedTextChunker with provided chunk_size and chunk_overlap.
Dict example::

{"chunker_type": "tokenizer", "tokenizer": "bert-base-uncased"}

:param label_prefixes: List of label prefixes to strip (e.g., B-, I-).
:raises ImportError: If transformers or torch libraries are not installed.
"""
Expand Down Expand Up @@ -199,8 +208,10 @@ def __init__(
context=context,
)

# Initialize the text chunker
if text_chunker:
# Initialize text chunker (object, dict config, or default)
if isinstance(text_chunker, dict):
self.text_chunker = TextChunkerProvider(text_chunker).create_chunker()
elif text_chunker is not None:
self.text_chunker = text_chunker
else:
self.text_chunker = CharacterBasedTextChunker(
Expand Down
Loading