microsoft · yuriihavrylko · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 31, 2026
diff --git a/docs/analyzer/recognizer_registry_provider.md b/docs/analyzer/recognizer_registry_provider.md
@@ -107,6 +107,15 @@ The recognizer list comprises of both the predefined and custom recognizers, for
   - `supported_entity`: the detected entity associated by the recognizer.
   - `deny_list`: A list of words to detect, in case the recognizer uses a predefined list of words.
   - `deny_list_score`: confidence score for a term identified using a deny-list.
+  - `text_chunker`: configures how long texts are split for NER recognizers (`GLiNERRecognizer`, `HuggingFaceNerRecognizer`). Accepts a dict with a `chunker_type` key; the remaining keys are passed as parameters to the selected chunker. Available types: `character` (default) and `tokenizer` (uses the model's tokenizer for accurate token-based splitting). Example:
+
+    ```yaml
+    - name: GLiNERRecognizer
+      type: predefined
+      text_chunker:
+        chunker_type: tokenizer
+        tokenizer: urchade/gliner_multi_pii-v1
+    ```
 
 !!! tip "Configuration Tip: Agglutinative languages (e.g., Korean)"
 

diff --git a/docs/samples/python/gliner.md b/docs/samples/python/gliner.md
@@ -75,6 +75,36 @@ results = analyzer_engine.analyze(
 print(results)
 ```
 
+## Text Chunking
+
+By default, `GLiNERRecognizer` splits long texts into character-based chunks (250 characters per chunk, with a 50-character overlap between consecutive chunks). You can customize this via `text_chunker`:
+
+**From Python:**
+
+```python
+from presidio_analyzer.chunkers import CharacterBasedTextChunker
+
+gliner_recognizer = GLiNERRecognizer(
+    model_name="urchade/gliner_multi_pii-v1",
+    entity_mapping=entity_mapping,
+    text_chunker=CharacterBasedTextChunker(chunk_size=400, chunk_overlap=60),
+)
+```
+
+**From YAML (using tokenizer-based chunking):**
+
+```yaml
+- name: GLiNERRecognizer
+  type: predefined
+  text_chunker:
+    chunker_type: tokenizer
+    tokenizer: urchade/gliner_multi_pii-v1
+    max_tokens: 512
+    overlap_tokens: 32
+```
+
+The `tokenizer` chunker uses the model's actual tokenizer to split text by token count, respecting the model's token limit instead of approximating with character counts.
+
 ## ONNX Runtime Support
 
 GLiNERRecognizer supports using ONNX Runtime as a backend, which provides better CPU compatibility and can prevent crashes on older CPUs without AVX2 instruction set support (e.g., Intel Sandy Bridge).

diff --git a/presidio-analyzer/presidio_analyzer/chunkers/__init__.py b/presidio-analyzer/presidio_analyzer/chunkers/__init__.py
@@ -5,11 +5,14 @@
     CharacterBasedTextChunker,
 )
 from presidio_analyzer.chunkers.text_chunker_provider import TextChunkerProvider
+from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
+    TokenizerBasedTextChunker,
+)
 
 __all__ = [
     "BaseTextChunker",
     "TextChunk",
     "CharacterBasedTextChunker",
     "TextChunkerProvider",
+    "TokenizerBasedTextChunker",
 ]
-
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py b/presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py
@@ -1,7 +1,7 @@
 """Factory provider for creating text chunkers from configuration."""
 
 import logging
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, Optional
 
 from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
 from presidio_analyzer.chunkers.character_based_text_chunker import (
@@ -10,11 +10,6 @@
 
 logger = logging.getLogger("presidio-analyzer")
 
-# Registry mapping chunker type names to classes
-_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
-    "character": CharacterBasedTextChunker,
-}
-
 
 class TextChunkerProvider:
     """Create text chunkers from configuration.
@@ -44,17 +39,23 @@ def create_chunker(self) -> BaseTextChunker:
         config = self.chunker_configuration.copy()
         chunker_type = config.pop("chunker_type", "character")
 
-        if chunker_type not in _CHUNKER_REGISTRY:
+        if chunker_type == "character":
+            chunker_class = CharacterBasedTextChunker
+        elif chunker_type == "tokenizer":
+            from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
+                TokenizerBasedTextChunker,
+            )
+
+            chunker_class = TokenizerBasedTextChunker
+        else:
             raise ValueError(
                 f"Unknown chunker_type '{chunker_type}'. "
-                f"Available: {list(_CHUNKER_REGISTRY.keys())}"
+                f"Available: ['character', 'tokenizer']"
             )
 
-        chunker_class = _CHUNKER_REGISTRY[chunker_type]
         try:
             return chunker_class(**config)
         except TypeError as exc:
             raise ValueError(
                 f"Invalid configuration for chunker_type '{chunker_type}': {config}"
             ) from exc
-
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/tokenizer_based_text_chunker.py b/presidio-analyzer/presidio_analyzer/chunkers/tokenizer_based_text_chunker.py
@@ -0,0 +1,200 @@
+"""Tokenizer-based text chunker using HuggingFace tokenizers."""
+
+import logging
+from typing import TYPE_CHECKING, List, Optional, Union
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizerBase
+
+logger = logging.getLogger("presidio-analyzer")
+
+# Fallback when the tokenizer does not expose a finite model_max_length
+_DEFAULT_MAX_TOKENS = 512
+
+
+class TokenizerBasedTextChunker(BaseTextChunker):
+    """Text chunker that splits text based on tokenizer token counts.
+
+    Unlike character-based chunking, this respects the model's actual token
+    limit and avoids splitting mid-subword. Chunks are defined by token
+    boundaries and mapped back to character offsets.
+
+    Can be configured from YAML via the ``text_chunker`` field::
+
+        text_chunker:
+          chunker_type: tokenizer
+          max_tokens: 512
+          overlap_tokens: 32
+
+    When ``tokenizer`` is omitted, the chunker starts in deferred mode and
+    the recognizer resolves it at model-load time using the model's own
+    tokenizer (via :meth:`resolve`).
+
+    :param tokenizer: A HuggingFace tokenizer name (str), a loaded
+        PreTrainedTokenizer instance, or None for deferred mode.
+    :param max_tokens: Maximum number of tokens per chunk. When None, it is
+        derived from the tokenizer's model_max_length (512 if missing or
+        unreasonably large) minus the special tokens the tokenizer adds.
+    :param overlap_tokens: Number of tokens to overlap between consecutive
+        chunks (must be >= 0 and < max_tokens). Defaults to 32.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Optional[Union[str, "PreTrainedTokenizerBase"]] = None,
+        max_tokens: Optional[int] = None,
+        overlap_tokens: int = 32,
+    ):
+        if tokenizer is None:
+            # Deferred mode: tokenizer will be provided later via resolve().
+            # Store config for now; validation happens in resolve().
+            self.tokenizer = None
+            self.max_tokens = max_tokens
+            self.overlap_tokens = overlap_tokens
+            return
+
+        if isinstance(tokenizer, str):
+            try:
+                from transformers import AutoTokenizer
+            except ImportError as e:
+                raise ImportError(
+                    "transformers is required to load a tokenizer by name. "
+                    "Install it with: pip install transformers"
+                ) from e
+            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+
+        self._init_with_tokenizer(tokenizer, max_tokens, overlap_tokens)
+
+    def _init_with_tokenizer(
+        self,
+        tokenizer: "PreTrainedTokenizerBase",
+        max_tokens: Optional[int],
+        overlap_tokens: int,
+    ) -> None:
+        """Validate settings against a loaded tokenizer, then store them."""
+        if not getattr(tokenizer, "is_fast", True):
+            raise ValueError(
+                "TokenizerBasedTextChunker requires a fast tokenizer "
+                "(one that supports return_offsets_mapping). "
+                "Use AutoTokenizer.from_pretrained(name, use_fast=True)."
+            )
+
+        if max_tokens is None:
+            raw = getattr(tokenizer, "model_max_length", _DEFAULT_MAX_TOKENS)
+            # Some tokenizers report absurdly large values (e.g. 1e30), which
+            # is not a usable limit; fall back to the default in that case.
+            if raw is None or raw > 1_000_000:
+                raw = _DEFAULT_MAX_TOKENS
+
+            # Reserve space for special tokens ([CLS], [SEP], etc.) that the
+            # NER pipeline adds automatically, so chunks don't exceed the
+            # model's actual input limit.
+            num_special = getattr(
+                tokenizer, "num_special_tokens_to_add", lambda pair=False: 0
+            )(pair=False)
+            max_tokens = max(1, raw - num_special)
+
+            # Clamp overlap if auto-derived max_tokens is smaller than default overlap
+            if overlap_tokens >= max_tokens:
+                overlap_tokens = max(0, max_tokens - 1)
+                logger.warning(
+                    "overlap_tokens clamped to %d (max_tokens=%d)",
+                    overlap_tokens,
+                    max_tokens,
+                )
+
+        if max_tokens <= 0:
+            raise ValueError("max_tokens must be greater than 0")
+        if overlap_tokens < 0 or overlap_tokens >= max_tokens:
+            raise ValueError(
+                "overlap_tokens must be non-negative and less than max_tokens"
+            )
+
+        # Assign only after every check has passed, so a failed resolve()
+        # leaves a deferred chunker deferred instead of half-initialized.
+        self.tokenizer = tokenizer
+        self.max_tokens = max_tokens
+        self.overlap_tokens = overlap_tokens
+
+    def resolve(
+        self, tokenizer: "PreTrainedTokenizerBase"
+    ) -> "TokenizerBasedTextChunker":
+        """Resolve a deferred chunker with the model's own tokenizer.
+
+        :param tokenizer: A loaded HuggingFace fast tokenizer.
+        :return: self, for convenience.
+        """
+        self._init_with_tokenizer(tokenizer, self.max_tokens, self.overlap_tokens)
+        return self
+
+    @property
+    def is_deferred(self) -> bool:
+        """Whether this chunker is waiting for a tokenizer."""
+        return self.tokenizer is None
+
+    def chunk(self, text: str) -> List[TextChunk]:
+        """Split text into token-aligned chunks with character offset tracking.
+
+        :param text: The input text to chunk.
+        :return: List of TextChunk objects with text and position information.
+        :raises RuntimeError: If tokenizer has not been resolved yet.
+        """
+        if self.tokenizer is None:
+            raise RuntimeError(
+                "TokenizerBasedTextChunker has no tokenizer. "
+                "Either pass one at init or call resolve(tokenizer) first."
+            )
+        if not text:
+            return []
+
+        encoding = self.tokenizer(
+            text,
+            return_offsets_mapping=True,
+            add_special_tokens=False,
+            truncation=False,
+        )
+
+        offsets = encoding.get("offset_mapping")
+        if offsets is None:
+            raise ValueError(
+                "Tokenizer did not return offset_mapping. "
+                "TokenizerBasedTextChunker requires a fast tokenizer "
+                "(one that supports return_offsets_mapping)."
+            )
+        num_tokens = len(offsets)
+
+        logger.debug(
+            "Chunking text: length=%d chars, %d tokens, max_tokens=%d, overlap=%d",
+            len(text),
+            num_tokens,
+            self.max_tokens,
+            self.overlap_tokens,
+        )
+
+        if num_tokens <= self.max_tokens:
+            return [TextChunk(text=text, start=0, end=len(text))]
+
+        chunks = []
+        step = self.max_tokens - self.overlap_tokens
+        start_token = 0
+
+        while start_token < num_tokens:
+            end_token = min(start_token + self.max_tokens, num_tokens)
+
+            char_start = offsets[start_token][0]
+            char_end = offsets[end_token - 1][1]
+
+            chunks.append(
+                TextChunk(
+                    text=text[char_start:char_end], start=char_start, end=char_end
+                )
+            )
+
+            if end_token >= num_tokens:
+                break
+            start_token += step
+
+        logger.debug("Created %d chunks from text", len(chunks))
+        return chunks