microsoft · yuriihavrylko · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 31, 2026
diff --git a/docs/analyzer/recognizer_registry_provider.md b/docs/analyzer/recognizer_registry_provider.md
@@ -107,6 +107,15 @@ The recognizer list comprises of both the predefined and custom recognizers, for
   - `supported_entity`: the detected entity associated by the recognizer.
   - `deny_list`: A list of words to detect, in case the recognizer uses a predefined list of words.
   - `deny_list_score`: confidence score for a term identified using a deny-list.
+  - `text_chunker`: configures how long texts are split into chunks for NER recognizers (`GLiNERRecognizer`, `HuggingFaceNerRecognizer`). Accepts a dict with a `chunker_type` key plus the parameters of the selected chunker. Available types: `character` (default) and `tokenizer` (uses the tokenizer named in `tokenizer` for accurate token-based splitting). Example:
+
+    ```yaml
+    - name: GLiNERRecognizer
+      type: predefined
+      text_chunker:
+        chunker_type: tokenizer
+        tokenizer: urchade/gliner_multi_pii-v1
+    ```
 
 !!! tip "Configuration Tip: Agglutinative languages (e.g., Korean)"
 

diff --git a/docs/samples/python/gliner.md b/docs/samples/python/gliner.md
@@ -75,6 +75,36 @@ results = analyzer_engine.analyze(
 print(results)
 ```
 
+## Text Chunking
+
+By default, `GLiNERRecognizer` splits long texts into character-based chunks of 250 characters with a 50-character overlap. You can customize this via `text_chunker`:
+
+**From Python:**
+
+```python
+from presidio_analyzer.chunkers import CharacterBasedTextChunker
+
+gliner_recognizer = GLiNERRecognizer(
+    model_name="urchade/gliner_multi_pii-v1",
+    entity_mapping=entity_mapping,
+    text_chunker=CharacterBasedTextChunker(chunk_size=400, chunk_overlap=60),
+)
+```
+
+**From YAML (using tokenizer-based chunking):**
+
+```yaml
+- name: GLiNERRecognizer
+  type: predefined
+  text_chunker:
+    chunker_type: tokenizer
+    tokenizer: urchade/gliner_multi_pii-v1
+    max_tokens: 512
+    overlap_tokens: 32
+```
+
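+The `tokenizer` chunker splits text by token count using the HuggingFace tokenizer named in `tokenizer` (normally the recognizer's own model), so chunks respect the model's token limit instead of approximating it with character counts. If omitted, `max_tokens` defaults to the tokenizer's `model_max_length` (or 512 when that is unset or unreasonably large) and `overlap_tokens` defaults to 32.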
+
 ## ONNX Runtime Support
 
 GLiNERRecognizer supports using ONNX Runtime as a backend, which provides better CPU compatibility and can prevent crashes on older CPUs without AVX2 instruction set support (e.g., Intel Sandy Bridge).

diff --git a/presidio-analyzer/presidio_analyzer/chunkers/__init__.py b/presidio-analyzer/presidio_analyzer/chunkers/__init__.py
@@ -11,5 +11,17 @@
     "TextChunk",
     "CharacterBasedTextChunker",
     "TextChunkerProvider",
+    "TokenizerBasedTextChunker",
 ]
 
+
+def __getattr__(name: str):
+    """Import TokenizerBasedTextChunker lazily so transformers stays optional."""
+    if name != "TokenizerBasedTextChunker":
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    # Deferred so that importing the package never loads the tokenizer module.
+    from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
+        TokenizerBasedTextChunker as chunker_cls,
+    )
+    return chunker_cls
+
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py b/presidio-analyzer/presidio_analyzer/chunkers/text_chunker_provider.py
@@ -1,7 +1,7 @@
 """Factory provider for creating text chunkers from configuration."""
 
 import logging
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, Optional
 
 from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
 from presidio_analyzer.chunkers.character_based_text_chunker import (
@@ -10,11 +10,6 @@
 
 logger = logging.getLogger("presidio-analyzer")
 
-# Registry mapping chunker type names to classes
-_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
-    "character": CharacterBasedTextChunker,
-}
-
 
 class TextChunkerProvider:
     """Create text chunkers from configuration.
@@ -44,17 +39,23 @@ def create_chunker(self) -> BaseTextChunker:
         config = self.chunker_configuration.copy()
         chunker_type = config.pop("chunker_type", "character")
 
-        if chunker_type not in _CHUNKER_REGISTRY:
+        if chunker_type == "character":
+            chunker_class = CharacterBasedTextChunker
+        elif chunker_type == "tokenizer":
+            from presidio_analyzer.chunkers.tokenizer_based_text_chunker import (
+                TokenizerBasedTextChunker,
+            )
+
+            chunker_class = TokenizerBasedTextChunker
+        else:
             raise ValueError(
                 f"Unknown chunker_type '{chunker_type}'. "
-                f"Available: {list(_CHUNKER_REGISTRY.keys())}"
+                f"Available: ['character', 'tokenizer']"
             )
 
-        chunker_class = _CHUNKER_REGISTRY[chunker_type]
         try:
             return chunker_class(**config)
         except TypeError as exc:
             raise ValueError(
                 f"Invalid configuration for chunker_type '{chunker_type}': {config}"
             ) from exc
-
diff --git a/presidio-analyzer/presidio_analyzer/chunkers/tokenizer_based_text_chunker.py b/presidio-analyzer/presidio_analyzer/chunkers/tokenizer_based_text_chunker.py
@@ -0,0 +1,160 @@
+"""Tokenizer-based text chunker using HuggingFace tokenizers."""
+import logging
+from typing import TYPE_CHECKING, List, Optional, Union
+
+from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizerBase
+
+logger = logging.getLogger("presidio-analyzer")
+
+# Fallback when the tokenizer does not expose a finite model_max_length
+_DEFAULT_MAX_TOKENS = 512
+
+# Tokenizers without a configured limit report a huge sentinel (e.g. 1e30);
+# anything above this threshold is treated as "no limit set".
+_MAX_REASONABLE_TOKENS = 1_000_000
+
+
+class TokenizerBasedTextChunker(BaseTextChunker):
+    """Text chunker that splits text based on tokenizer token counts.
+
+    Unlike character-based chunking, this respects the model's actual token
+    limit and avoids splitting mid-subword. Chunks are defined by token
+    boundaries and mapped back to character offsets.
+
+    Text is tokenized without special tokens, so ``max_tokens`` counts content
+    tokens only. NOTE(review): if the downstream model adds special or prompt
+    tokens of its own, pass a ``max_tokens`` that leaves room for them.
+
+    Can be configured from YAML by passing a tokenizer name string:
+
+        text_chunker:
+          chunker_type: tokenizer
+          tokenizer: bert-base-uncased
+          max_tokens: 512
+          overlap_tokens: 32
+
+    :param tokenizer: A HuggingFace tokenizer name (str) or a loaded
+        PreTrainedTokenizer instance. It must be a "fast" tokenizer, because
+        character offsets are needed to map tokens back to the text.
+    :param max_tokens: Maximum number of tokens per chunk. Defaults to the
+        tokenizer's model_max_length (falls back to 512 if not set or
+        unreasonably large).
+    :param overlap_tokens: Number of tokens to overlap between consecutive
+        chunks (must be >= 0 and < max_tokens). Defaults to 32.
+    :raises ImportError: If a tokenizer name is given but transformers is
+        not installed.
+    :raises ValueError: If the tokenizer is not a fast tokenizer, or if
+        max_tokens or overlap_tokens is out of range.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Union[str, "PreTrainedTokenizerBase"],
+        max_tokens: Optional[int] = None,
+        overlap_tokens: int = 32,
+    ):
+        if isinstance(tokenizer, str):
+            try:
+                from transformers import AutoTokenizer
+            except ImportError as e:
+                raise ImportError(
+                    "transformers is required to load a tokenizer by name. "
+                    "Install it with: pip install transformers"
+                ) from e
+            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+
+        # Offset mappings are only available from fast tokenizers, so reject
+        # a slow one here instead of failing on the first chunk() call.
+        # Objects that do not expose ``is_fast`` are accepted unchanged.
+        if not getattr(tokenizer, "is_fast", True):
+            raise ValueError(
+                "TokenizerBasedTextChunker requires a fast tokenizer: "
+                "slow tokenizers cannot return offset mappings"
+            )
+
+        self.tokenizer = tokenizer
+
+        if max_tokens is None:
+            raw = getattr(tokenizer, "model_max_length", _DEFAULT_MAX_TOKENS)
+            # Some tokenizers report absurdly large values (e.g. 1e30)
+            if raw is None or raw > _MAX_REASONABLE_TOKENS:
+                max_tokens = _DEFAULT_MAX_TOKENS
+            else:
+                # Coerce to int: the value is used for index arithmetic
+                max_tokens = int(raw)
+
+        if max_tokens <= 0:
+            raise ValueError("max_tokens must be greater than 0")
+        if overlap_tokens < 0 or overlap_tokens >= max_tokens:
+            raise ValueError(
+                "overlap_tokens must be non-negative and less than max_tokens"
+            )
+
+        self.max_tokens = max_tokens
+        self.overlap_tokens = overlap_tokens
+
+    def chunk(self, text: str) -> List[TextChunk]:
+        """Split text into token-aligned chunks with character offset tracking.
+
+        Text that fits within max_tokens is returned as one chunk covering the
+        whole input. Otherwise each chunk spans from the start offset of its
+        first token to the end offset of its last token, and consecutive
+        chunks share overlap_tokens tokens.
+
+        :param text: The input text to chunk.
+        :return: List of TextChunk objects with text and position information
+            (empty for empty input).
+        """
+        if not text:
+            return []
+
+        encoding = self.tokenizer(
+            text,
+            return_offsets_mapping=True,
+            add_special_tokens=False,
+            truncation=False,
+        )
+
+        offsets = encoding["offset_mapping"]
+        num_tokens = len(offsets)
+
+        logger.debug(
+            "Chunking text: length=%d chars, %d tokens, max_tokens=%d, overlap=%d",
+            len(text),
+            num_tokens,
+            self.max_tokens,
+            self.overlap_tokens,
+        )
+
+        if num_tokens <= self.max_tokens:
+            return [TextChunk(text=text, start=0, end=len(text))]
+
+        chunks = []
+        # Each window advances by its non-overlapping part; __init__ enforces
+        # overlap_tokens < max_tokens, so the step is always at least 1.
+        step = self.max_tokens - self.overlap_tokens
+
+        for start_token in range(0, num_tokens, step):
+            end_token = min(start_token + self.max_tokens, num_tokens)
+
+            char_start = offsets[start_token][0]
+            char_end = offsets[end_token - 1][1]
+
+            chunks.append(
+                TextChunk(
+                    text=text[char_start:char_end],
+                    start=char_start,
+                    end=char_end,
+                )
+            )
+
+            # Stop once a window reaches the last token, so no trailing chunk
+            # consisting only of overlap is emitted.
+            if end_token >= num_tokens:
+                break
+
+        logger.debug("Created %d chunks from text", len(chunks))
+        return chunks
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py
@@ -1,13 +1,13 @@
 import json
 import logging
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from presidio_analyzer import (
     AnalysisExplanation,
     LocalRecognizer,
     RecognizerResult,
 )
-from presidio_analyzer.chunkers import BaseTextChunker
+from presidio_analyzer.chunkers import BaseTextChunker, TextChunkerProvider
 from presidio_analyzer.nlp_engine import (
     NerModelConfiguration,
     NlpArtifacts,
@@ -39,7 +39,9 @@ def __init__(
         multi_label: bool = False,
         threshold: float = 0.30,
         map_location: Optional[str] = None,
-        text_chunker: Optional[BaseTextChunker] = None,
+        text_chunker: Optional[Union[BaseTextChunker, Dict[str, Any]]] = None,
+        chunk_size: int = 250,
+        chunk_overlap: int = 50,
         load_onnx_model: bool = False,
         onnx_model_file: str = "model.onnx",
         **model_kwargs,
@@ -63,9 +65,15 @@ def __init__(
         (see GLiNER's documentation)
         :param map_location: The device to use for the model.
             If None, will auto-detect GPU or use CPU.
-        :param text_chunker: Custom text chunking strategy. If None, uses
-            CharacterBasedTextChunker with default settings (chunk_size=250,
-            chunk_overlap=50)
+        :param text_chunker: Text chunking strategy. Accepts a BaseTextChunker
+            instance (Python) or a dict config (YAML). If None, uses
+            CharacterBasedTextChunker with provided chunk_size and chunk_overlap.
+            Dict example::
+
+                {"chunker_type": "tokenizer", "tokenizer": "bert-base-uncased"}
+
+        :param chunk_size: Maximum number of characters per chunk.
+        :param chunk_overlap: Number of characters to overlap between chunks.
         :param load_onnx_model: Whether to load the model using ONNX Runtime.
             If True, uses ONNX Runtime backend which supports CPUs without AVX2.
             Requires onnxruntime to be installed. Default is False.
@@ -118,15 +126,17 @@ def __init__(
         self.onnx_model_file = onnx_model_file
         self.model_kwargs = model_kwargs
 
-        # Use provided chunker or default to in-house character-based chunker
-        if text_chunker is not None:
+        # Initialize text chunker (object, dict config, or default)
+        if isinstance(text_chunker, dict):
+            self.text_chunker = TextChunkerProvider(text_chunker).create_chunker()
+        elif text_chunker is not None:
             self.text_chunker = text_chunker
         else:
             from presidio_analyzer.chunkers import CharacterBasedTextChunker
 
             self.text_chunker = CharacterBasedTextChunker(
-                chunk_size=250,
-                chunk_overlap=50,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
             )
 
         self.gliner = None

diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/huggingface_ner_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/huggingface_ner_recognizer.py
@@ -22,7 +22,11 @@
     LocalRecognizer,
     RecognizerResult,
 )
-from presidio_analyzer.chunkers import BaseTextChunker, CharacterBasedTextChunker
+from presidio_analyzer.chunkers import (
+    BaseTextChunker,
+    CharacterBasedTextChunker,
+    TextChunkerProvider,
+)
 from presidio_analyzer.nlp_engine import NlpArtifacts, device_detector
 
 try:
@@ -115,7 +119,7 @@ def __init__(
         chunk_size: int = 400,
         device: Optional[Union[str, int]] = None,
         tokenizer_name: Optional[str] = None,
-        text_chunker: Optional[BaseTextChunker] = None,
+        text_chunker: Optional[Union[BaseTextChunker, Dict[str, Any]]] = None,
         label_prefixes: Optional[List[str]] = None,
         **kwargs,
     ):
@@ -144,8 +148,13 @@ def __init__(
         :param chunk_overlap: Number of characters to overlap between chunks.
         :param chunk_size: Maximum number of characters per chunk.
         :param tokenizer_name: Name of the tokenizer. Defaults to model_name.
-        :param text_chunker: Custom text chunking strategy. If None, uses
+        :param text_chunker: Text chunking strategy. Accepts a BaseTextChunker
+            instance (Python) or a dict config (YAML). If None, uses
             CharacterBasedTextChunker with provided chunk_size and chunk_overlap.
+            Dict example::
+
+                {"chunker_type": "tokenizer", "tokenizer": "bert-base-uncased"}
+
         :param label_prefixes: List of label prefixes to strip (e.g., B-, I-).
         :raises ImportError: If transformers or torch libraries are not installed.
         """
@@ -199,8 +208,10 @@ def __init__(
             context=context,
         )
 
-        # Initialize the text chunker
-        if text_chunker:
+        # Initialize text chunker (object, dict config, or default)
+        if isinstance(text_chunker, dict):
+            self.text_chunker = TextChunkerProvider(text_chunker).create_chunker()
+        elif text_chunker is not None:
             self.text_chunker = text_chunker
         else:
             self.text_chunker = CharacterBasedTextChunker(