-
Notifications
You must be signed in to change notification settings - Fork 1.1k
feat: Add tokenizer-based text chunking for NER recognizers #2041
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
9888943
c16d31c
6cab1fc
e4a5665
bb0f6b0
9894929
17c0635
9035af5
47d3a01
266dfe7
2bf870b
4b4b376
c5520f8
1358b7d
06b2e69
3f32f68
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,136 @@ | ||
| """Tokenizer-based text chunker using HuggingFace tokenizers.""" | ||
|
|
||
| import logging | ||
| from typing import TYPE_CHECKING, List, Union | ||
|
|
||
| from presidio_analyzer.chunkers.base_chunker import BaseTextChunker, TextChunk | ||
|
|
||
| if TYPE_CHECKING: | ||
| from transformers import PreTrainedTokenizerBase | ||
|
|
||
| logger = logging.getLogger("presidio-analyzer") | ||
|
|
||
| # Fallback when the tokenizer does not expose a finite model_max_length | ||
| _DEFAULT_MAX_TOKENS = 512 | ||
|
|
||
|
|
||
class TokenizerBasedTextChunker(BaseTextChunker):
    """Text chunker that splits text based on tokenizer token counts.

    Unlike character-based chunking, this respects the model's actual token
    limit and avoids splitting mid-subword. Chunks are defined by token
    boundaries and mapped back to character offsets.

    Can be configured from YAML by passing a tokenizer name string::

        chunker_configuration:
          chunker_type: tokenizer
          tokenizer: bert-base-uncased
          max_tokens: 512
          overlap_tokens: 32

    :param tokenizer: A HuggingFace tokenizer name (str) or a loaded
        PreTrainedTokenizer instance.
    :param max_tokens: Maximum number of tokens per chunk. When omitted, it
        is derived from the tokenizer's model_max_length (falls back to 512
        if not set or unreasonably large), minus the number of special
        tokens the tokenizer adds to a single sequence.
    :param overlap_tokens: Number of tokens to overlap between consecutive
        chunks (must be >= 0 and < max_tokens). Defaults to 32.
    :raises ImportError: If a tokenizer name is given but ``transformers``
        is not installed.
    :raises ValueError: If ``max_tokens`` is not positive, or if
        ``overlap_tokens`` is negative or not smaller than ``max_tokens``.
    """

    def __init__(
        self,
        tokenizer: Union[str, "PreTrainedTokenizerBase"],
        max_tokens: Union[int, None] = None,
        overlap_tokens: int = 32,
    ):
        if isinstance(tokenizer, str):
            # Import lazily so transformers stays an optional dependency;
            # it is only needed when a tokenizer has to be loaded by name.
            try:
                from transformers import AutoTokenizer
            except ImportError as e:
                raise ImportError(
                    "transformers is required to load a tokenizer by name. "
                    "Install it with: pip install transformers"
                ) from e
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

        self.tokenizer = tokenizer

        if max_tokens is None:
            max_tokens = self._default_max_tokens(tokenizer)

        # Validate after the default is resolved: the derived default is
        # always >= 1, so this can only reject a caller-supplied value.
        if max_tokens <= 0:
            raise ValueError("max_tokens must be greater than 0")
        if overlap_tokens < 0 or overlap_tokens >= max_tokens:
            raise ValueError(
                "overlap_tokens must be non-negative and less than max_tokens"
            )

        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens

    @staticmethod
    def _default_max_tokens(tokenizer: "PreTrainedTokenizerBase") -> int:
        """Derive a per-chunk token budget from the tokenizer itself.

        :param tokenizer: The tokenizer to inspect.
        :return: The tokenizer's model_max_length (or 512 when it is missing,
            None, or larger than 1,000,000) minus the number of special
            tokens added to a single sequence, never less than 1.
        """
        raw = getattr(tokenizer, "model_max_length", _DEFAULT_MAX_TOKENS)
        # Some tokenizers report absurdly large values (e.g. 1e30)
        if raw is None or raw > 1_000_000:
            limit = _DEFAULT_MAX_TOKENS
        else:
            limit = raw

        # Reserve space for special tokens ([CLS], [SEP], etc.) that the
        # NER pipeline adds automatically, so chunks don't exceed the
        # model's actual input limit.
        num_special = getattr(
            tokenizer, "num_special_tokens_to_add", lambda pair=False: 0
        )(pair=False)
        return max(1, limit - num_special)

    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into token-aligned chunks with character offset tracking.

        The text is tokenized once without special tokens and without
        truncation. If it fits in ``max_tokens`` it is returned as a single
        chunk spanning the whole string; otherwise a window of ``max_tokens``
        tokens advances by ``max_tokens - overlap_tokens`` tokens and each
        window is mapped back to a character span through the tokenizer's
        offset mapping.

        :param text: The input text to chunk.
        :return: List of TextChunk objects with text and position information.
            Empty when ``text`` is empty.
        """
        if not text:
            return []

        # Offsets are requested so token windows can be converted to
        # character spans; special tokens are excluded because they have
        # no position in the original text.
        encoding = self.tokenizer(
            text,
            return_offsets_mapping=True,
            add_special_tokens=False,
            truncation=False,
        )

        offsets = encoding["offset_mapping"]
        num_tokens = len(offsets)

        logger.debug(
            "Chunking text: length=%d chars, %d tokens, max_tokens=%d, overlap=%d",
            len(text),
            num_tokens,
            self.max_tokens,
            self.overlap_tokens,
        )

        if num_tokens <= self.max_tokens:
            return [TextChunk(text=text, start=0, end=len(text))]

        chunks = []
        # step is >= 1 because __init__ enforces overlap_tokens < max_tokens.
        step = self.max_tokens - self.overlap_tokens

        for start_token in range(0, num_tokens, step):
            end_token = min(start_token + self.max_tokens, num_tokens)

            # Span from the first character of the first token in the window
            # to the last character of the last token in the window.
            char_start = offsets[start_token][0]
            char_end = offsets[end_token - 1][1]

            chunks.append(
                TextChunk(
                    text=text[char_start:char_end], start=char_start, end=char_end
                )
            )

            # Stop once a window reaches the final token; continuing would
            # only emit chunks fully contained in the previous one.
            if end_token >= num_tokens:
                break

        logger.debug("Created %d chunks from text", len(chunks))
        return chunks
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,13 @@ | ||
| import json | ||
| import logging | ||
| from typing import Dict, List, Optional | ||
| from typing import Any, Dict, List, Optional, Union | ||
|
|
||
| from presidio_analyzer import ( | ||
| AnalysisExplanation, | ||
| LocalRecognizer, | ||
| RecognizerResult, | ||
| ) | ||
| from presidio_analyzer.chunkers import BaseTextChunker | ||
| from presidio_analyzer.chunkers import BaseTextChunker, TextChunkerProvider | ||
| from presidio_analyzer.nlp_engine import ( | ||
| NerModelConfiguration, | ||
| NlpArtifacts, | ||
|
|
@@ -39,7 +39,9 @@ def __init__( | |
| multi_label: bool = False, | ||
| threshold: float = 0.30, | ||
| map_location: Optional[str] = None, | ||
| text_chunker: Optional[BaseTextChunker] = None, | ||
| text_chunker: Optional[Union[BaseTextChunker, Dict[str, Any]]] = None, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In which scenario would the user pass a dict here? I think it's better to ask the user to pass the chunker class, and instantiate it using the dict prior to calling the recognizer.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The dict path is used when loading from YAML config. The recognizer registry passes YAML fields as kwargs directly to the constructor. Without dict support here, users would need custom Python code to instantiate the chunker before passing it - which defeats the purpose of YAML-based configuration. The alternative would be handling the dict-to-object conversion in the registry loader, but that would require the loader to know about chunker-specific logic. Keeping it in the recognizer felt more self-contained.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We have a pydantic validation layer between YAML and actual Presidio classes to handle configuration errors more gracefully. I don't see a reason not to use it here too, and avoid generic dicts as input. Please take a look and see if there's a reason for it not to apply here too. Thanks!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. Moved the dict-to-object conversion into the Pydantic validation layer. |
||
| chunk_size: int = 250, | ||
| chunk_overlap: int = 50, | ||
| load_onnx_model: bool = False, | ||
| onnx_model_file: str = "model.onnx", | ||
| **model_kwargs, | ||
|
|
@@ -63,9 +65,15 @@ def __init__( | |
| (see GLiNER's documentation) | ||
| :param map_location: The device to use for the model. | ||
| If None, will auto-detect GPU or use CPU. | ||
| :param text_chunker: Custom text chunking strategy. If None, uses | ||
| CharacterBasedTextChunker with default settings (chunk_size=250, | ||
| chunk_overlap=50) | ||
| :param text_chunker: Text chunking strategy. Accepts a BaseTextChunker | ||
| instance (Python) or a dict config (YAML). If None, uses | ||
| CharacterBasedTextChunker with provided chunk_size and chunk_overlap. | ||
| Dict example:: | ||
|
|
||
| {"chunker_type": "tokenizer", "tokenizer": "bert-base-uncased"} | ||
|
|
||
| :param chunk_size: Maximum number of characters per chunk. | ||
| :param chunk_overlap: Number of characters to overlap between chunks. | ||
| :param load_onnx_model: Whether to load the model using ONNX Runtime. | ||
| If True, uses ONNX Runtime backend which supports CPUs without AVX2. | ||
| Requires onnxruntime to be installed. Default is False. | ||
|
|
@@ -118,15 +126,17 @@ def __init__( | |
| self.onnx_model_file = onnx_model_file | ||
| self.model_kwargs = model_kwargs | ||
|
|
||
| # Use provided chunker or default to in-house character-based chunker | ||
| if text_chunker is not None: | ||
| # Initialize text chunker (object, dict config, or default) | ||
| if isinstance(text_chunker, dict): | ||
| self.text_chunker = TextChunkerProvider(text_chunker).create_chunker() | ||
| elif text_chunker is not None: | ||
| self.text_chunker = text_chunker | ||
| else: | ||
| from presidio_analyzer.chunkers import CharacterBasedTextChunker | ||
|
|
||
| self.text_chunker = CharacterBasedTextChunker( | ||
| chunk_size=250, | ||
| chunk_overlap=50, | ||
| chunk_size=chunk_size, | ||
| chunk_overlap=chunk_overlap, | ||
| ) | ||
|
|
||
| self.gliner = None | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.