Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
9888943
feat: add TokenizerBasedTextChunker and update TextChunkerProvider fo…
yuriihavrylko May 25, 2026
c16d31c
feat: enhance GLiNER and HuggingFaceNer recognizers with customizable…
yuriihavrylko May 25, 2026
6cab1fc
docs: add text chunking customization for GLiNERRecognizer using toke…
yuriihavrylko May 25, 2026
e4a5665
feat: reserve special tokens in max_tokens calculation for TokenizerB…
yuriihavrylko May 31, 2026
bb0f6b0
Merge branch 'main' into feat/tokenizer-based-text-chunker
omri374 Jun 1, 2026
9894929
style: format code in TokenizerBasedTextChunker
yuriihavrylko Jun 1, 2026
17c0635
refactor: remove lazy import for TokenizerBasedTextChunker in __init_…
yuriihavrylko Jun 1, 2026
9035af5
feat: enforce fast tokenizer requirement in TokenizerBasedTextChunker…
yuriihavrylko Jun 1, 2026
47d3a01
test: add YAML configuration tests for character and tokenizer chunke…
yuriihavrylko Jun 1, 2026
266dfe7
test: add tests for unknown chunker type and invalid text chunker dic…
yuriihavrylko Jun 1, 2026
2bf870b
feat: enhance tokenizer configuration and clamp overlap tokens in Tok…
yuriihavrylko Jun 1, 2026
4b4b376
test: add tests for slow tokenizer and overlap clamping in TokenizerB…
yuriihavrylko Jun 1, 2026
c5520f8
feat: add pydantic TextChunkerConfig model and integrate with GLiNER …
yuriihavrylko Jun 4, 2026
1358b7d
feat: make tokenizer optional in TokenizerBasedTextChunker with defer…
yuriihavrylko Jun 4, 2026
06b2e69
feat: handle missing offset_mapping in TokenizerBasedTextChunker
yuriihavrylko Jun 4, 2026
3f32f68
docs: update GLiNERRecognizer configuration to use model_name and imp…
yuriihavrylko Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/analyzer/recognizer_registry_provider.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,15 @@ The recognizer list comprises of both the predefined and custom recognizers, for
- `supported_entity`: the detected entity associated by the recognizer.
- `deny_list`: A list of words to detect, in case the recognizer uses a predefined list of words.
- `deny_list_score`: confidence score for a term identified using a deny-list.
- `text_chunker`: configures how long texts are split into chunks by NER recognizers (`GLiNERRecognizer`, `HuggingFaceNerRecognizer`). Accepts a dict with a `chunker_type` key plus that chunker's parameters. Available types: `character` (default) and `tokenizer` (uses the model's tokenizer for accurate token-based splitting). Example:

```yaml
- name: GLiNERRecognizer
  type: predefined
  text_chunker:
    chunker_type: tokenizer
    tokenizer: urchade/gliner_multi_pii-v1
```

!!! tip "Configuration Tip: Agglutinative languages (e.g., Korean)"

Expand Down
30 changes: 30 additions & 0 deletions docs/samples/python/gliner.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,36 @@ results = analyzer_engine.analyze(
print(results)
```

## Text Chunking

By default, GLiNERRecognizer splits long texts into character-based chunks (250 chars, 50 overlap). You can customize this via `text_chunker`:

**From Python:**

```python
from presidio_analyzer.chunkers import CharacterBasedTextChunker

gliner_recognizer = GLiNERRecognizer(
    model_name="urchade/gliner_multi_pii-v1",
    entity_mapping=entity_mapping,
    text_chunker=CharacterBasedTextChunker(chunk_size=400, chunk_overlap=60),
)
```

**From YAML (using tokenizer-based chunking):**

```yaml
- name: GLiNERRecognizer
  type: predefined
  text_chunker:
    chunker_type: tokenizer
    tokenizer: urchade/gliner_multi_pii-v1
    max_tokens: 512
    overlap_tokens: 32
```

The `tokenizer` chunker uses the model's actual tokenizer to split text by token count, respecting the model's token limit instead of approximating with character counts.

## ONNX Runtime Support

GLiNERRecognizer supports using ONNX Runtime as a backend, which provides better CPU compatibility and can prevent crashes on older CPUs without AVX2 instruction set support (e.g., Intel Sandy Bridge).
Expand Down
5 changes: 4 additions & 1 deletion presidio-analyzer/presidio_analyzer/chunkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
CharacterBasedTextChunker,
)
from presidio_analyzer.chunkers.text_chunker_provider import TextChunkerProvider
from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
TokenizerBasedTextChunker,
)

# Names exported by the chunkers package on `from ... import *`.
__all__ = [
"BaseTextChunker",
"TextChunk",
"CharacterBasedTextChunker",
"TextChunkerProvider",
"TokenizerBasedTextChunker",
]

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Factory provider for creating text chunkers from configuration."""

import logging
from typing import Any, Dict, Optional, Type
from typing import Any, Dict, Optional

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
from presidio_analyzer.chunkers.character_based_text_chunker import (
Expand All @@ -10,11 +10,6 @@

logger = logging.getLogger("presidio-analyzer")

# Registry mapping chunker type names to classes
_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
"character": CharacterBasedTextChunker,
}


class TextChunkerProvider:
"""Create text chunkers from configuration.
Expand Down Expand Up @@ -44,17 +39,23 @@ def create_chunker(self) -> BaseTextChunker:
config = self.chunker_configuration.copy()
chunker_type = config.pop("chunker_type", "character")

if chunker_type not in _CHUNKER_REGISTRY:
if chunker_type == "character":
chunker_class = CharacterBasedTextChunker
elif chunker_type == "tokenizer":
Comment thread
omri374 marked this conversation as resolved.
from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
TokenizerBasedTextChunker,
)

chunker_class = TokenizerBasedTextChunker
Comment thread
yuriihavrylko marked this conversation as resolved.
else:
raise ValueError(
f"Unknown chunker_type '{chunker_type}'. "
f"Available: {list(_CHUNKER_REGISTRY.keys())}"
f"Available: ['character', 'tokenizer']"
)
Comment thread
omri374 marked this conversation as resolved.
Comment thread
omri374 marked this conversation as resolved.
Comment thread
yuriihavrylko marked this conversation as resolved.

chunker_class = _CHUNKER_REGISTRY[chunker_type]
try:
return chunker_class(**config)
except TypeError as exc:
raise ValueError(
f"Invalid configuration for chunker_type '{chunker_type}': {config}"
) from exc

Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
"""Tokenizer-based text chunker using HuggingFace tokenizers."""

import logging
from typing import TYPE_CHECKING, List, Optional, Union

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk

if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase

# Logger registered under the "presidio-analyzer" name.
logger = logging.getLogger("presidio-analyzer")

# Per-chunk token budget used when max_tokens is not given and the
# tokenizer's model_max_length is missing, None, or larger than 1,000,000.
_DEFAULT_MAX_TOKENS = 512


class TokenizerBasedTextChunker(BaseTextChunker):
    """Text chunker that splits text based on tokenizer token counts.

    Unlike character-based chunking, this respects the model's actual token
    limit and never cuts inside a token. Chunks are defined by token
    boundaries and mapped back to character offsets in the original text.

    Can be configured from YAML via the ``text_chunker`` field::

        text_chunker:
          chunker_type: tokenizer
          max_tokens: 512
          overlap_tokens: 32

    When ``tokenizer`` is omitted, the chunker starts in deferred mode and
    the recognizer resolves it at model-load time using the model's own
    tokenizer (via :meth:`resolve`).

    :param tokenizer: A HuggingFace tokenizer name (str), a loaded
        PreTrainedTokenizer instance, or None for deferred mode.
    :param max_tokens: Maximum number of tokens per chunk. Defaults to the
        tokenizer's model_max_length (falls back to 512 if not set or
        unreasonably large), minus the special tokens the tokenizer adds.
        An explicit value is used as given.
    :param overlap_tokens: Number of tokens to overlap between consecutive
        chunks (must be >= 0 and < max_tokens). Defaults to 32. When
        max_tokens is auto-derived, an overlap that is too large is clamped
        with a warning instead of raising.
    :raises ImportError: If ``tokenizer`` is a name and transformers is not
        installed.
    :raises ValueError: If the tokenizer is not a fast tokenizer, or
        max_tokens / overlap_tokens are invalid.
    """

    def __init__(
        self,
        tokenizer: Optional[Union[str, "PreTrainedTokenizerBase"]] = None,
        max_tokens: Optional[int] = None,
        overlap_tokens: int = 32,
    ):
        if tokenizer is None:
            # Deferred mode: tokenizer will be provided later via resolve().
            # Store config for now; validation happens in resolve().
            self.tokenizer = None
            self.max_tokens = max_tokens
            self.overlap_tokens = overlap_tokens
            return

        if isinstance(tokenizer, str):
            # Imported lazily so transformers stays an optional dependency.
            try:
                from transformers import AutoTokenizer
            except ImportError as e:
                raise ImportError(
                    "transformers is required to load a tokenizer by name. "
                    "Install it with: pip install transformers"
                ) from e
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

        self._init_with_tokenizer(tokenizer, max_tokens, overlap_tokens)

    def _init_with_tokenizer(
        self,
        tokenizer: "PreTrainedTokenizerBase",
        max_tokens: Optional[int],
        overlap_tokens: int,
    ) -> None:
        """Validate the configuration and bind a loaded tokenizer instance.

        Instance attributes are only assigned after every check has passed,
        so a failed call (e.g. from :meth:`resolve`) leaves the chunker in
        its previous state instead of half-initialized.

        :param tokenizer: A loaded HuggingFace fast tokenizer.
        :param max_tokens: Explicit per-chunk token limit, or None to derive
            it from the tokenizer.
        :param overlap_tokens: Number of tokens shared by consecutive chunks.
        :raises ValueError: If the tokenizer is not fast, or the resulting
            max_tokens / overlap_tokens combination is invalid.
        """
        if not getattr(tokenizer, "is_fast", True):
            raise ValueError(
                "TokenizerBasedTextChunker requires a fast tokenizer "
                "(one that supports return_offsets_mapping). "
                "Use AutoTokenizer.from_pretrained(name, use_fast=True)."
            )

        if max_tokens is None:
            raw = getattr(tokenizer, "model_max_length", _DEFAULT_MAX_TOKENS)
            # Some tokenizers report absurdly large values (e.g. 1e30)
            if raw is None or raw > 1_000_000:
                max_tokens = _DEFAULT_MAX_TOKENS
            else:
                max_tokens = raw

            # Reserve space for special tokens ([CLS], [SEP], etc.) that the
            # NER pipeline adds automatically, so chunks don't exceed the
            # model's actual input limit.
            num_special = getattr(
                tokenizer, "num_special_tokens_to_add", lambda pair=False: 0
            )(pair=False)
            max_tokens = max(1, max_tokens - num_special)

            # Clamp overlap if auto-derived max_tokens is smaller than default overlap
            if overlap_tokens >= max_tokens:
                overlap_tokens = max(0, max_tokens - 1)
                logger.warning(
                    "overlap_tokens clamped to %d (max_tokens=%d)",
                    overlap_tokens,
                    max_tokens,
                )

        if max_tokens <= 0:
            raise ValueError("max_tokens must be greater than 0")
        if overlap_tokens < 0 or overlap_tokens >= max_tokens:
            raise ValueError(
                "overlap_tokens must be non-negative and less than max_tokens"
            )

        # Commit state only once the configuration is known to be valid.
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens

    def resolve(
        self, tokenizer: "PreTrainedTokenizerBase"
    ) -> "TokenizerBasedTextChunker":
        """Resolve a deferred chunker with the model's own tokenizer.

        The max_tokens and overlap_tokens stored at construction time are
        validated against the tokenizer. If validation fails, the chunker
        stays deferred and its stored configuration is unchanged.

        :param tokenizer: A loaded HuggingFace fast tokenizer.
        :return: self, for convenience.
        :raises ValueError: If the tokenizer or stored configuration is
            invalid.
        """
        self._init_with_tokenizer(tokenizer, self.max_tokens, self.overlap_tokens)
        return self

    @property
    def is_deferred(self) -> bool:
        """Whether this chunker is waiting for a tokenizer."""
        return self.tokenizer is None

    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into token-aligned chunks with character offset tracking.

        Text that fits within max_tokens is returned as a single chunk that
        spans the whole input. Longer text is cut into windows of max_tokens
        tokens that advance by ``max_tokens - overlap_tokens`` tokens; each
        chunk covers the characters from its first token's start offset to
        its last token's end offset.

        :param text: The input text to chunk.
        :return: List of TextChunk objects with text and position information.
        :raises RuntimeError: If tokenizer has not been resolved yet.
        :raises ValueError: If the tokenizer returns no offset mapping.
        """
        if self.tokenizer is None:
            raise RuntimeError(
                "TokenizerBasedTextChunker has no tokenizer. "
                "Either pass one at init or call resolve(tokenizer) first."
            )
        if not text:
            return []

        # Special tokens are excluded so every offset maps to real input
        # characters; truncation is disabled so the whole text is covered.
        encoding = self.tokenizer(
            text,
            return_offsets_mapping=True,
            add_special_tokens=False,
            truncation=False,
        )

        offsets = encoding.get("offset_mapping")
        if offsets is None:
            raise ValueError(
                "Tokenizer did not return offset_mapping. "
                "TokenizerBasedTextChunker requires a fast tokenizer "
                "(one that supports return_offsets_mapping)."
            )
        num_tokens = len(offsets)

        logger.debug(
            "Chunking text: length=%d chars, %d tokens, max_tokens=%d, overlap=%d",
            len(text),
            num_tokens,
            self.max_tokens,
            self.overlap_tokens,
        )

        if num_tokens <= self.max_tokens:
            return [TextChunk(text=text, start=0, end=len(text))]

        chunks: List[TextChunk] = []
        # overlap_tokens < max_tokens is enforced at init, so step >= 1 and
        # the loop always advances.
        step = self.max_tokens - self.overlap_tokens
        start_token = 0

        while start_token < num_tokens:
            end_token = min(start_token + self.max_tokens, num_tokens)

            char_start = offsets[start_token][0]
            char_end = offsets[end_token - 1][1]

            chunks.append(
                TextChunk(
                    text=text[char_start:char_end], start=char_start, end=char_end
                )
            )

            # Stop once the window reaches the last token; stepping further
            # would only emit a chunk fully contained in this one.
            if end_token >= num_tokens:
                break
            start_token += step

        logger.debug("Created %d chunks from text", len(chunks))
        return chunks
Loading