diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py
index df4e23d81..567b0d9a7 100644
--- a/presidio-analyzer/presidio_analyzer/analyzer_engine.py
+++ b/presidio-analyzer/presidio_analyzer/analyzer_engine.py
@@ -256,6 +256,7 @@ def analyze(
 
         # Remove duplicates or low score results
         results = EntityRecognizer.remove_duplicates(results)
+        results = EntityRecognizer.merge_adjacent_text_entities(results, text)
         results = self.__remove_low_scores(results, score_threshold)
 
         if allow_list:
diff --git a/presidio-analyzer/presidio_analyzer/entity_recognizer.py b/presidio-analyzer/presidio_analyzer/entity_recognizer.py
index 18a6fc3d6..f0bda615e 100644
--- a/presidio-analyzer/presidio_analyzer/entity_recognizer.py
+++ b/presidio-analyzer/presidio_analyzer/entity_recognizer.py
@@ -289,6 +289,58 @@ def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]
 
         return filtered_results
 
+    @staticmethod
+    def merge_adjacent_text_entities(
+        results: List[RecognizerResult], text: str
+    ) -> List[RecognizerResult]:
+        """
+        Merge same-type entities that touch, overlap or are split by whitespace.
+
+        When an NER model detects a multi-token entity (e.g. "Dave Jones") as
+        separate consecutive spans of the same entity type, this method fuses
+        them into a single span so downstream anonymization produces one
+        placeholder instead of many.
+
+        Each result is compared with the latest span of its own entity type,
+        so a result of another type sorted in between (e.g. an overlapping
+        detection from a different recognizer) does not prevent the merge.
+        The fused span keeps the first span's explanation and metadata, takes
+        the highest score, and never ends before either of its parts.
+
+        :param results: List[RecognizerResult] (need not be sorted)
+        :param text: the original text that was analyzed
+        :return: List[RecognizerResult] with whitespace-adjacent same-type
+            entities merged; order is preserved by start index
+        """
+        if not results:
+            return results
+
+        merged: List[RecognizerResult] = []
+        # Position in `merged` of the latest span seen for each entity type.
+        last_index_by_type = {}
+
+        for nxt in sorted(results, key=lambda x: x.start):
+            idx = last_index_by_type.get(nxt.entity_type)
+            if idx is not None:
+                current = merged[idx]
+                # An overlap makes this slice empty, so it merges as well.
+                gap = text[current.end : nxt.start]
+                if not gap.strip():
+                    merged[idx] = RecognizerResult(
+                        entity_type=current.entity_type,
+                        start=current.start,
+                        # max() keeps a contained span from shrinking the result.
+                        end=max(current.end, nxt.end),
+                        score=max(current.score, nxt.score),
+                        analysis_explanation=current.analysis_explanation,
+                        recognition_metadata=current.recognition_metadata,
+                    )
+                    continue
+            last_index_by_type[nxt.entity_type] = len(merged)
+            merged.append(nxt)
+
+        return merged
+
     @staticmethod
     def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
         """
diff --git a/presidio-analyzer/tests/test_entity_recognizer.py b/presidio-analyzer/tests/test_entity_recognizer.py
index 3bb695eb5..8406cf6f5 100644
--- a/presidio-analyzer/tests/test_entity_recognizer.py
+++ b/presidio-analyzer/tests/test_entity_recognizer.py
@@ -122,6 +122,107 @@ def test_when_remove_duplicates_contained_shorter_length_results_removed():
 
 
 import pytest
+
+# ---------------------------------------------------------------------------
+# merge_adjacent_text_entities tests
+# ---------------------------------------------------------------------------
+
+def test_merge_adjacent_same_type_entities():
+    """Two PERSON spans separated by a single space are merged into one."""
+    text = "My name is Dave Jones"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=11, end=15, score=0.85),
+        RecognizerResult(entity_type="PERSON", start=16, end=21, score=0.85),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 1
+    assert merged[0].start == 11
+    assert merged[0].end == 21
+    assert merged[0].entity_type == "PERSON"
+
+
+def test_merge_adjacent_preserves_max_score():
+    """Merged entity takes the higher of the two scores."""
+    text = "Anne Marie"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.7),
+        RecognizerResult(entity_type="PERSON", start=5, end=10, score=0.9),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 1
+    assert merged[0].score == 0.9
+
+
+def test_merge_adjacent_three_tokens():
+    """Three consecutive same-type spans are merged into a single result."""
+    text = "Jean Luc Picard"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.8),
+        RecognizerResult(entity_type="PERSON", start=5, end=8, score=0.8),
+        RecognizerResult(entity_type="PERSON", start=9, end=15, score=0.8),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 1
+    assert merged[0].start == 0
+    assert merged[0].end == 15
+
+
+def test_no_merge_when_different_entity_types():
+    """Adjacent spans of different types are NOT merged."""
+    text = "John London"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.8),
+        RecognizerResult(entity_type="LOCATION", start=5, end=11, score=0.8),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 2
+
+
+def test_no_merge_when_gap_has_non_whitespace():
+    """Spans separated by non-whitespace characters are NOT merged."""
+    text = "foo, bar"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=3, score=0.8),
+        RecognizerResult(entity_type="PERSON", start=5, end=8, score=0.8),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 2
+
+
+def test_merge_keeps_end_of_longer_span_when_next_is_contained():
+    """A span contained in the current one must not shrink the merged end."""
+    text = "Jean Luc Picard"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=15, score=0.8),
+        RecognizerResult(entity_type="PERSON", start=5, end=8, score=0.6),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    assert len(merged) == 1
+    assert merged[0].start == 0
+    assert merged[0].end == 15
+
+
+def test_merge_not_blocked_by_other_entity_type_in_between():
+    """A different-type result sorted in between does not stop the merge."""
+    text = "Dave Jones"
+    results = [
+        RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.85),
+        RecognizerResult(entity_type="ORGANIZATION", start=0, end=10, score=0.4),
+        RecognizerResult(entity_type="PERSON", start=5, end=10, score=0.85),
+    ]
+    merged = EntityRecognizer.merge_adjacent_text_entities(results, text)
+    persons = [r for r in merged if r.entity_type == "PERSON"]
+    assert len(merged) == 2
+    assert len(persons) == 1
+    assert (persons[0].start, persons[0].end) == (0, 10)
+
+
+def test_merge_empty_results():
+    """Empty input returns empty output without error."""
+    merged = EntityRecognizer.merge_adjacent_text_entities([], "some text")
+    assert merged == []
+
+
 sanitizer_test_set = [
     [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"],
     ["def", "", "def"],