Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions presidio-analyzer/presidio_analyzer/analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ def analyze(

# Remove duplicates or low score results
results = EntityRecognizer.remove_duplicates(results)
results = EntityRecognizer.merge_adjacent_text_entities(results, text)
results = self.__remove_low_scores(results, score_threshold)
Comment on lines 258 to 260

if allow_list:
Expand Down
42 changes: 42 additions & 0 deletions presidio-analyzer/presidio_analyzer/entity_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,48 @@ def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]

return filtered_results

@staticmethod
def merge_adjacent_text_entities(
    results: List[RecognizerResult], text: str
) -> List[RecognizerResult]:
    """
    Merge adjacent entities of the same type separated only by whitespace.

    When an NER model detects a multi-token entity (e.g. "Dave Jones") as
    separate consecutive spans of the same entity type, this method fuses
    them into a single span so downstream anonymization produces one
    placeholder instead of many.

    Only results that are neighbours after sorting by start index are
    compared. Same-type spans that touch, overlap, or are nested are merged
    as well, because the text between them is empty. The merged span always
    covers the union of its parts and never shrinks.

    The merged result keeps the highest score of its parts, and the
    ``analysis_explanation`` and ``recognition_metadata`` of the first
    (left-most) part.

    :param results: List[RecognizerResult] (need not be sorted)
    :param text: the original text that was analyzed
    :return: List[RecognizerResult] with whitespace-adjacent same-type
        entities merged, ordered by start index. An empty input is
        returned unchanged.
    """
    if not results:
        return results

    sorted_results = sorted(results, key=lambda x: x.start)
    merged: List[RecognizerResult] = []
    current = sorted_results[0]

    for nxt in sorted_results[1:]:
        # When nxt starts at or before current.end (touching, overlapping
        # or nested spans) this slice is empty, so the pair also counts as
        # "separated only by whitespace".
        gap = text[current.end : nxt.start]
        if current.entity_type == nxt.entity_type and not gap.strip():
            current = RecognizerResult(
                entity_type=current.entity_type,
                start=current.start,
                # nxt may end before current does (nested span); taking
                # nxt.end unconditionally would shrink the detected span.
                end=max(current.end, nxt.end),
                score=max(current.score, nxt.score),
                analysis_explanation=current.analysis_explanation,
                recognition_metadata=current.recognition_metadata,
            )
        else:
            merged.append(current)
            current = nxt

    # Flush the span that was still being accumulated when the loop ended.
    merged.append(current)
    return merged

@staticmethod
def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
"""
Expand Down
73 changes: 73 additions & 0 deletions presidio-analyzer/tests/test_entity_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,79 @@ def test_when_remove_duplicates_contained_shorter_length_results_removed():

import pytest


# ---------------------------------------------------------------------------
# merge_adjacent_text_entities tests
# ---------------------------------------------------------------------------

def test_merge_adjacent_same_type_entities():
    """A single space between two PERSON spans collapses them into one."""
    sentence = "My name is Dave Jones"
    first_name = RecognizerResult(
        entity_type="PERSON", start=11, end=15, score=0.85
    )
    last_name = RecognizerResult(
        entity_type="PERSON", start=16, end=21, score=0.85
    )

    output = EntityRecognizer.merge_adjacent_text_entities(
        [first_name, last_name], sentence
    )

    assert len(output) == 1
    full_name = output[0]
    assert full_name.start == 11
    assert full_name.end == 21
    assert full_name.entity_type == "PERSON"


def test_merge_adjacent_preserves_max_score():
    """The fused span reports the larger of the two input scores."""
    sentence = "Anne Marie"
    weaker = RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.7)
    stronger = RecognizerResult(entity_type="PERSON", start=5, end=10, score=0.9)

    output = EntityRecognizer.merge_adjacent_text_entities(
        [weaker, stronger], sentence
    )

    assert len(output) == 1
    assert output[0].score == 0.9


def test_merge_adjacent_three_tokens():
    """A run of three same-type spans ends up as one combined result."""
    sentence = "Jean Luc Picard"
    token_offsets = [(0, 4), (5, 8), (9, 15)]
    spans = [
        RecognizerResult(entity_type="PERSON", start=begin, end=stop, score=0.8)
        for begin, stop in token_offsets
    ]

    output = EntityRecognizer.merge_adjacent_text_entities(spans, sentence)

    assert len(output) == 1
    combined = output[0]
    assert combined.start == 0
    assert combined.end == 15


def test_no_merge_when_different_entity_types():
    """Neighbouring spans stay separate when their entity types differ."""
    sentence = "John London"
    person = RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.8)
    place = RecognizerResult(entity_type="LOCATION", start=5, end=11, score=0.8)

    output = EntityRecognizer.merge_adjacent_text_entities(
        [person, place], sentence
    )

    assert len(output) == 2


def test_no_merge_when_gap_has_non_whitespace():
    """A comma (or any non-whitespace) between spans blocks the merge."""
    sentence = "foo, bar"
    left = RecognizerResult(entity_type="PERSON", start=0, end=3, score=0.8)
    right = RecognizerResult(entity_type="PERSON", start=5, end=8, score=0.8)

    output = EntityRecognizer.merge_adjacent_text_entities(
        [left, right], sentence
    )

    assert len(output) == 2


def test_merge_empty_results():
    """An empty result list passes through without raising."""
    no_results = []
    output = EntityRecognizer.merge_adjacent_text_entities(no_results, "some text")
    assert output == []
Comment on lines +192 to +195


sanitizer_test_set = [
[" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"],
["def", "", "def"],
Expand Down