Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -337,13 +337,18 @@ def _add_n_words(
# to an interesting context word, so we allow it and add 1 to
# the number of collected words

# collect at most n words (in lower case)
# collect at most n_words keywords (plus the entity token itself).
# A hard cap on total token positions prevents scanning arbitrarily
# far when n_words is small (e.g. 0), which would otherwise allow
# context words far outside the intended window to contribute.
remaining = n_words + 1
while 0 <= i < len(lemmas) and remaining > 0:
max_token_positions = n_words * 2 + 1
while 0 <= i < len(lemmas) and remaining > 0 and max_token_positions > 0:
lower_lemma = lemmas[i].lower()
if lower_lemma in lemmatized_filtered_keywords:
context_words.append(lower_lemma)
remaining -= 1
Comment on lines 344 to 350
max_token_positions -= 1
i = i - 1 if is_backward else i + 1
Comment on lines +340 to 352
return context_words

Expand Down
80 changes: 80 additions & 0 deletions presidio-analyzer/tests/test_lemma_context_aware_enhancer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pytest

from presidio_analyzer import LemmaContextAwareEnhancer


Expand Down Expand Up @@ -280,3 +282,81 @@ def test_when_substring_mode_then_compound_words_work_in_integration(spacy_nlp_e
assert (
enhanced_results[0].analysis_explanation.supportive_context_word == "passport"
)


def test_add_n_words_forward_respects_token_window():
    """
    Verify that _add_n_words hard-caps at max_token_positions = n_words * 2 + 1.

    With n_words=1 the hard cap is 3 token positions (indices 0, 1 and 2 when
    scanning forward from index 0). A keyword sitting at index 3 (beyond the
    cap) must not be collected, even though the keyword budget of
    n_words + 1 has not been used up.

    The entity lemma is deliberately NOT listed as a keyword. If it were, it
    would consume one unit of the keyword budget, the budget would run out at
    "near_keyword", and this test would pass even without the hard cap.
    Regression test for: context words used outside the suffix/prefix window.
    """
    # entity(0), stop(1), near_keyword(2), far_keyword(3)
    # n_words=1 → max_token_positions = 1*2+1 = 3 → scans positions 0,1,2 only
    lemmas = ["entity", "the", "near_keyword", "far_keyword"]
    lemmatized_filtered_keywords = ["near_keyword", "far_keyword"]

    result = LemmaContextAwareEnhancer._add_n_words(
        index=0,
        n_words=1,
        lemmas=lemmas,
        lemmatized_filtered_keywords=lemmatized_filtered_keywords,
        is_backward=False,
    )

    assert "near_keyword" in result
    # Only one keyword was collected, so the keyword budget (2) is not
    # exhausted; "far_keyword" at index 3 is excluded solely because it lies
    # outside the 3 scanned positions (0, 1, 2).
    assert "far_keyword" not in result


def test_add_n_words_backward_respects_token_window():
    """
    Verify that _add_n_words hard-caps at max_token_positions = n_words * 2 + 1.

    With n_words=1 the hard cap is 3 token positions. A keyword sitting
    more than 2 steps behind the entity must not be collected, even though
    the keyword budget of n_words + 1 has not been used up.

    The entity lemma is deliberately NOT listed as a keyword. If it were, it
    would consume one unit of the keyword budget, the budget would run out at
    "near_keyword", and this test would pass even without the hard cap.
    Regression test for: context words used outside the suffix/prefix window.
    """
    # far_keyword(0), stop(1), near_keyword(2), entity(3)
    # n_words=1 → max_token_positions=3 → scans positions 3,2,1 (3 steps)
    lemmas = ["far_keyword", "the", "near_keyword", "entity"]
    lemmatized_filtered_keywords = ["near_keyword", "far_keyword"]

    result = LemmaContextAwareEnhancer._add_n_words(
        index=3,
        n_words=1,
        lemmas=lemmas,
        lemmatized_filtered_keywords=lemmatized_filtered_keywords,
        is_backward=True,
    )

    assert "near_keyword" in result
    # "far_keyword" is at position 0, which is 3 steps from the entity —
    # just outside the hard cap of max_token_positions=3 (positions 3,2,1).
    # Only one keyword was collected, so the keyword budget (2) is not what
    # stops the scan here.
    assert "far_keyword" not in result


def test_add_n_words_zero_window_only_includes_entity():
    """
    With n_words=0 the hard cap is 1 token position: only the entity.

    Two scenarios are checked in both scan directions:

    * The entity lemma is itself a keyword: it is the only word returned.
    * The entity lemma is not a keyword: nothing is returned. In this case
      the keyword budget (n_words + 1 = 1) is never consumed, so only the
      hard cap on scanned positions keeps the neighbouring keywords out.
    """
    lemmas = ["before_keyword", "entity", "after_keyword"]
    lemmatized_filtered_keywords = ["before_keyword", "entity", "after_keyword"]

    forward = LemmaContextAwareEnhancer._add_n_words(
        index=1,
        n_words=0,
        lemmas=lemmas,
        lemmatized_filtered_keywords=lemmatized_filtered_keywords,
        is_backward=False,
    )
    backward = LemmaContextAwareEnhancer._add_n_words(
        index=1,
        n_words=0,
        lemmas=lemmas,
        lemmatized_filtered_keywords=lemmatized_filtered_keywords,
        is_backward=True,
    )

    assert forward == ["entity"]
    assert backward == ["entity"]

    # Entity excluded from the keywords: the scan must still stop after the
    # single allowed position instead of walking on to a neighbouring keyword.
    neighbour_keywords = ["before_keyword", "after_keyword"]
    for is_backward in (False, True):
        capped = LemmaContextAwareEnhancer._add_n_words(
            index=1,
            n_words=0,
            lemmas=lemmas,
            lemmatized_filtered_keywords=neighbour_keywords,
            is_backward=is_backward,
        )
        assert capped == []