diff --git a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py
index aac1e4ed8c..a249461c53 100644
--- a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py
+++ b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py
@@ -337,13 +337,18 @@ def _add_n_words(
         # to an interesting context word, so we allow it and add 1 to
         # the number of collected words
 
-        # collect at most n words (in lower case)
+        # collect at most n_words keywords (plus the entity token itself).
+        # A hard cap on total token positions prevents scanning arbitrarily
+        # far when n_words is small (e.g. 0), which would otherwise allow
+        # context words far outside the intended window to contribute.
         remaining = n_words + 1
-        while 0 <= i < len(lemmas) and remaining > 0:
+        max_token_positions = n_words * 2 + 1
+        while 0 <= i < len(lemmas) and remaining > 0 and max_token_positions > 0:
             lower_lemma = lemmas[i].lower()
             if lower_lemma in lemmatized_filtered_keywords:
                 context_words.append(lower_lemma)
                 remaining -= 1
+            max_token_positions -= 1
             i = i - 1 if is_backward else i + 1
         return context_words
 
diff --git a/presidio-analyzer/tests/test_lemma_context_aware_enhancer.py b/presidio-analyzer/tests/test_lemma_context_aware_enhancer.py
index 74df0e8e85..aa7fb32724 100644
--- a/presidio-analyzer/tests/test_lemma_context_aware_enhancer.py
+++ b/presidio-analyzer/tests/test_lemma_context_aware_enhancer.py
@@ -1,3 +1,5 @@
+import pytest
+
 from presidio_analyzer import LemmaContextAwareEnhancer
 
 
@@ -280,3 +282,86 @@ def test_when_substring_mode_then_compound_words_work_in_integration(spacy_nlp_e
     assert (
         enhanced_results[0].analysis_explanation.supportive_context_word == "passport"
     )
+
+
+def test_add_n_words_forward_respects_token_window():
+    """
+    Verify that _add_n_words hard-caps at max_token_positions = n_words * 2 + 1.
+
+    With n_words=1 the hard cap is 3 token positions (indices 0-2). A keyword
+    at index 3 (beyond the cap) must not be collected, even though the keyword
+    budget (n_words + 1 = 2) is not yet exhausted at that point.
+    Regression test for: context words used outside the suffix/prefix window.
+    """
+    # entity(0), stop(1), near_keyword(2), far_keyword(3)
+    # n_words=1 → max_token_positions = 1*2+1 = 3 → scans positions 0,1,2 only
+    lemmas = ["entity", "the", "near_keyword", "far_keyword"]
+    # The entity is deliberately NOT a keyword: otherwise the keyword budget
+    # (n_words + 1) alone would stop the scan at near_keyword and this test
+    # would pass even without the hard cap.
+    lemmatized_filtered_keywords = ["near_keyword", "far_keyword"]
+
+    result = LemmaContextAwareEnhancer._add_n_words(
+        index=0,
+        n_words=1,
+        lemmas=lemmas,
+        lemmatized_filtered_keywords=lemmatized_filtered_keywords,
+        is_backward=False,
+    )
+
+    # "far_keyword" is at position 3, just past the 3-position window (0,1,2).
+    assert result == ["near_keyword"]
+
+
+def test_add_n_words_backward_respects_token_window():
+    """
+    Verify that _add_n_words hard-caps at max_token_positions = n_words * 2 + 1.
+
+    With n_words=1 the hard cap is 3 token positions. A keyword sitting
+    more than 2 steps behind the entity must not be collected, even though
+    the keyword budget (n_words + 1 = 2) is not yet exhausted at that point.
+    Regression test for: context words used outside the suffix/prefix window.
+    """
+    # far_keyword(0), stop(1), near_keyword(2), entity(3)
+    # n_words=1 → max_token_positions=3 → scans positions 3,2,1 (3 steps)
+    lemmas = ["far_keyword", "the", "near_keyword", "entity"]
+    # The entity is deliberately NOT a keyword, so that only the hard cap
+    # (not the keyword budget) can keep far_keyword out of the result.
+    lemmatized_filtered_keywords = ["near_keyword", "far_keyword"]
+
+    result = LemmaContextAwareEnhancer._add_n_words(
+        index=3,
+        n_words=1,
+        lemmas=lemmas,
+        lemmatized_filtered_keywords=lemmatized_filtered_keywords,
+        is_backward=True,
+    )
+
+    # "far_keyword" is at position 0, which is 3 steps from the entity —
+    # just outside the hard cap of max_token_positions=3 (positions 3,2,1).
+    assert result == ["near_keyword"]
+
+
+@pytest.mark.parametrize("is_backward", [False, True])
+@pytest.mark.parametrize(
+    "keywords, expected",
+    [
+        # Entity is a keyword: it is the only token inside the window.
+        (["before_keyword", "entity", "after_keyword"], ["entity"]),
+        # Entity is not a keyword: its neighbours are still out of reach.
+        (["before_keyword", "after_keyword"], []),
+    ],
+)
+def test_add_n_words_zero_window_only_includes_entity(keywords, expected, is_backward):
+    """With n_words=0 the hard cap is 1 token position: only the entity."""
+    lemmas = ["before_keyword", "entity", "after_keyword"]
+
+    result = LemmaContextAwareEnhancer._add_n_words(
+        index=1,
+        n_words=0,
+        lemmas=lemmas,
+        lemmatized_filtered_keywords=keywords,
+        is_backward=is_backward,
+    )
+
+    assert result == expected