From bb3a2f9ad8a0ffc3fd0c2854ca3fc393d7009372 Mon Sep 17 00:00:00 2001
From: Oleksandr Sanin
Date: Thu, 28 May 2026 09:13:58 +0000
Subject: [PATCH] feat(analyzer): merge adjacent same-type entities separated by whitespace

When an NER model tokenizes a multi-word entity (e.g. "Dave Jones") it
may return two consecutive spans of the same entity type with only
whitespace between them. Previously Presidio would emit two separate
placeholders (e.g. <PERSON> <PERSON>) instead of a single one, breaking
anonymization quality and downstream synthetic-data generation.

A new static method
EntityRecognizer.merge_adjacent_text_entities(results, text) is added.
It sorts results by start offset and greedily merges consecutive spans
of the same entity type whose intervening gap consists solely of
whitespace, assigning the maximum score to the fused span.

The method is called in AnalyzerEngine.analyze() immediately after
remove_duplicates(), so it integrates transparently into the existing
pipeline without breaking any existing behaviour.

Six unit tests are added to test_entity_recognizer.py covering: basic
two-token merge, score preservation, three-token chain merge, different
entity types not merged, non-whitespace gap not merged, and empty input.

Closes #1090

Signed-off-by: Oleksandr Sanin
---
 .../presidio_analyzer/analyzer_engine.py      |  1 +
 .../presidio_analyzer/entity_recognizer.py    | 48 ++++++++++++
 .../tests/test_entity_recognizer.py           | 73 +++++++++++++++++++
 3 files changed, 122 insertions(+)

diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py
index df4e23d815..567b0d9a7f 100644
--- a/presidio-analyzer/presidio_analyzer/analyzer_engine.py
+++ b/presidio-analyzer/presidio_analyzer/analyzer_engine.py
@@ -256,6 +256,7 @@ def analyze(
 
         # Remove duplicates or low score results
         results = EntityRecognizer.remove_duplicates(results)
+        results = EntityRecognizer.merge_adjacent_text_entities(results, text)
         results = self.__remove_low_scores(results, score_threshold)
 
         if allow_list:
diff --git a/presidio-analyzer/presidio_analyzer/entity_recognizer.py b/presidio-analyzer/presidio_analyzer/entity_recognizer.py
index 18a6fc3d65..f0bda615ef 100644
--- a/presidio-analyzer/presidio_analyzer/entity_recognizer.py
+++ b/presidio-analyzer/presidio_analyzer/entity_recognizer.py
@@ -289,6 +289,54 @@ def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]
 
         return filtered_results
 
+    @staticmethod
+    def merge_adjacent_text_entities(
+        results: List[RecognizerResult], text: str
+    ) -> List[RecognizerResult]:
+        """
+        Merge adjacent entities of the same type separated only by whitespace.
+
+        When an NER model detects a multi-token entity (e.g. "Dave Jones") as
+        separate consecutive spans of the same entity type, this method fuses
+        them into a single span so downstream anonymization produces one
+        placeholder instead of many. Touching or overlapping spans of the
+        same type are fused as well, since the gap between them is empty.
+
+        The fused span keeps the highest score of its parts and the
+        analysis_explanation and recognition_metadata of the first part.
+
+        :param results: List[RecognizerResult] (need not be sorted)
+        :param text: the original text that was analyzed
+        :return: List[RecognizerResult] with whitespace-adjacent same-type
+            entities merged, sorted by start index
+        """
+        if not results:
+            return results
+
+        sorted_results = sorted(results, key=lambda x: x.start)
+        merged: List[RecognizerResult] = []
+        current = sorted_results[0]
+
+        for nxt in sorted_results[1:]:
+            # Empty when the spans touch or overlap (slice end <= slice start).
+            gap = text[current.end : nxt.start]
+            if current.entity_type == nxt.entity_type and gap.strip() == "":
+                current = RecognizerResult(
+                    entity_type=current.entity_type,
+                    start=current.start,
+                    # nxt may be nested inside current; never shrink the span.
+                    end=max(current.end, nxt.end),
+                    score=max(current.score, nxt.score),
+                    analysis_explanation=current.analysis_explanation,
+                    recognition_metadata=current.recognition_metadata,
+                )
+            else:
+                merged.append(current)
+                current = nxt
+
+        merged.append(current)
+        return merged
+
     @staticmethod
     def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
         """
diff --git a/presidio-analyzer/tests/test_entity_recognizer.py b/presidio-analyzer/tests/test_entity_recognizer.py
index 3bb695eb5d..8406cf6f5f 100644
--- a/presidio-analyzer/tests/test_entity_recognizer.py
+++ b/presidio-analyzer/tests/test_entity_recognizer.py
@@ -122,6 +122,79 @@ def test_when_remove_duplicates_contained_shorter_length_results_removed():
 
 import pytest
 
+
+# ---------------------------------------------------------------------------
+# merge_adjacent_text_entities tests
+# ---------------------------------------------------------------------------
+
+def test_merge_adjacent_same_type_entities():
+    """Two PERSON spans separated by a single space are merged into one."""
+    text = "My name is Dave Jones"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=11, end=15, score=0.85),
+        RecognizerResult(entity_type="PERSON", start=16, end=21, score=0.85),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 1
+    assert merged[0].start == 11
+    assert merged[0].end == 21
+    assert merged[0].entity_type == "PERSON"
+
+
+def test_merge_adjacent_preserves_max_score():
+    """Merged entity takes the higher of the two scores."""
+    text = "Anne Marie"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.7),
+        RecognizerResult(entity_type="PERSON", start=5, end=10, score=0.9),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 1
+    assert merged[0].score == 0.9
+
+
+def test_merge_adjacent_three_tokens():
+    """Three consecutive same-type spans are merged into a single result."""
+    text = "Jean Luc Picard"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.8),
+        RecognizerResult(entity_type="PERSON", start=5, end=8, score=0.8),
+        RecognizerResult(entity_type="PERSON", start=9, end=15, score=0.8),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 1
+    assert merged[0].start == 0
+    assert merged[0].end == 15
+
+
+def test_no_merge_when_different_entity_types():
+    """Adjacent spans of different types are NOT merged."""
+    text = "John London"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.8),
+        RecognizerResult(entity_type="LOCATION", start=5, end=11, score=0.8),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 2
+
+
+def test_no_merge_when_gap_has_non_whitespace():
+    """Spans separated by non-whitespace characters are NOT merged."""
+    text = "foo, bar"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=3, score=0.8),
+        RecognizerResult(entity_type="PERSON", start=5, end=8, score=0.8),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 2
+
+
+def test_merge_empty_results():
+    """Empty input returns empty output without error."""
+    merged = EntityRecognizer.merge_adjacent_text_entities([], "some text")
+    assert merged == []
+
+
 sanitizer_test_set = [
     [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"],
     ["def", "", "def"],