From 3de90e6300dfa1f4aeb2f8dc7a6e17fd1cdc5977 Mon Sep 17 00:00:00 2001 From: devs6186 Date: Sun, 15 Mar 2026 19:53:59 +0530 Subject: [PATCH 1/3] rules: skip pure-literal case-insensitive regex evaluation via lowercased string fast path For Regex patterns that are pure literals with the /i flag (no metacharacters), add an O(1) pre-check using a frozenset of lowercased string values built once per scope evaluation. When the lowercased pattern is found in the set the rule is added to candidates without invoking the regex engine. When not found the full regex scan still runs to handle substring matches such as /createfile/i matching "CreateFileA". Adds _is_pure_literal_ci and _normalized_lower attrs to Regex and two tests to verify detection and correctness. Closes: https://github.com/mandiant/capa/issues/2129 --- capa/features/common.py | 12 ++++++++++ capa/rules/__init__.py | 25 +++++++++++++++---- tests/test_match.py | 53 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 5 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 5bde5d3599..f481b00fc8 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -328,6 +328,18 @@ def __init__(self, value: str, description=None): f"invalid regular expression: {value} it should use Python syntax, try it at https://pythex.org" ) from exc + # Detect pure-literal case-insensitive patterns: no regex metacharacters, + # just a simple string with the /i flag. For these we can skip the regex + # engine when the lowercased string value is present in the feature set, + # which is a common case for API names, file extensions, and registry keys. 
+ # See: https://github.com/mandiant/capa/issues/2129 + if value.endswith("/i") and re.escape(pat) == pat: + self._is_pure_literal_ci: bool = True + self._normalized_lower: str = pat.lower() + else: + self._is_pure_literal_ci = False + self._normalized_lower = "" + def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.regex"] += 1 diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 23fd0dd3c0..d090a30c69 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1988,11 +1988,6 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea # We may want to try to pre-evaluate these strings, based on their presence in the file, # to reduce the number of evaluations we do here. # See: https://github.com/mandiant/capa/issues/2126 - # - # We may also want to specialize case-insensitive strings, which would enable them to - # be indexed, and therefore skip the scanning here, improving performance. - # This strategy is described here: - # https://github.com/mandiant/capa/issues/2129 if feature_index.string_rules: # This is a FeatureSet that contains only String features. # Since we'll only be evaluating String/Regex features below, we don't care about @@ -2009,10 +2004,30 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea string_features[feature] = locations if string_features: + # Pre-compute the set of lowercased string values once per scope evaluation. + # This enables an O(1) fast path for pure-literal case-insensitive patterns: + # instead of invoking the regex engine for every candidate string, we check + # whether the lowercased pattern value is already present in the feature set. + # When found, the pattern is guaranteed to match and we skip the regex call. 
+ # When not found, we still fall back to the full regex scan to handle substring + # matches such as /createfile/i matching "CreateFileA". + # See: https://github.com/mandiant/capa/issues/2129 + lowercased_strings: frozenset[str] = frozenset( + feature.value.lower() for feature in string_features if isinstance(feature.value, str) + ) for rule_name, wanted_strings in feature_index.string_rules.items(): for wanted_string in wanted_strings: + # Fast path: pure-literal /i patterns can be resolved via O(1) lookup. + if ( + isinstance(wanted_string, capa.features.common.Regex) + and wanted_string._is_pure_literal_ci + and wanted_string._normalized_lower in lowercased_strings + ): + candidate_rule_names.add(rule_name) + break if wanted_string.evaluate(string_features): candidate_rule_names.add(rule_name) + break # Like with String/Regex features above, we have to scan for Bytes to find candidate rules. # diff --git a/tests/test_match.py b/tests/test_match.py index cfeeb7e9fc..5825bdcf6a 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -816,3 +816,56 @@ def test_index_features_nested_unstable(): assert not index.string_rules assert not index.bytes_rules + + +def test_regex_pure_literal_ci_fast_path_detection(): + """Verify that pure-literal case-insensitive Regex patterns are detected correctly.""" + # Pure literal patterns: no metacharacters, /i flag + r1 = capa.features.common.Regex("/createfile/i") + assert r1._is_pure_literal_ci is True + assert r1._normalized_lower == "createfile" + + r2 = capa.features.common.Regex("/useragent/i") + assert r2._is_pure_literal_ci is True + assert r2._normalized_lower == "useragent" + + # Complex patterns: has metacharacters, should NOT be flagged + r3 = capa.features.common.Regex("/create.*file/i") + assert r3._is_pure_literal_ci is False + + # Case-sensitive pattern: no /i flag + r4 = capa.features.common.Regex("/createfile/") + assert r4._is_pure_literal_ci is False + + +def test_regex_ci_fast_path_correctness(): + 
"""Verify the fast path produces the same results as the full regex engine.""" + rule_text = textwrap.dedent( + """ + rule: + meta: + name: test ci fast path + scopes: + static: function + dynamic: process + features: + - string: /createfile/i + """ + ) + r = capa.rules.Rule.from_yaml(rule_text) + rr = capa.rules.RuleSet([r]) + + # Should match: exact case-insensitive match (fast path) + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("CreateFile"): {0x0}}, 0x0) + assert "test ci fast path" in matches + + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("CREATEFILE"): {0x0}}, 0x0) + assert "test ci fast path" in matches + + # Should match: substring match (regex fallback path) + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("CreateFileA"): {0x0}}, 0x0) + assert "test ci fast path" in matches + + # Should not match + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("WriteFile"): {0x0}}, 0x0) + assert "test ci fast path" not in matches From d9bd6e4ef4291c7ee214d09be76c385a9c789f01 Mon Sep 17 00:00:00 2001 From: devs6186 Date: Sun, 15 Mar 2026 20:01:45 +0530 Subject: [PATCH 2/3] rules: skip pure-literal case-insensitive regex evaluation via lowercased string fast path Closes #2129 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index db5fe728ea..c561c52900 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,7 @@ - doc: fix typo in usage.md, add documentation links to README @devs6186 #2274 - doc: add table comparing ways to consume capa output (CLI, IDA, Ghidra, dynamic sandbox, web) @devs6186 #2273 - binja: add mypy config for top-level binaryninja module to fix mypy issues @devs6186 #2399 +- rules: skip regex engine for pure-literal case-insensitive patterns via O(1) lowercased-string lookup @devs6186 #2129 - ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777 - ci: pin pip-audit action SHAs and update to v1.1.0 @kami922 #1131 From
8bf87361b9b1374ba24d31874a1c762616833719 Mon Sep 17 00:00:00 2001 From: devs6186 Date: Mon, 16 Mar 2026 02:34:14 +0530 Subject: [PATCH 3/3] rules: lazily build lowercase set for pure-literal /i regex - simplify pure-literal /i detection assignment in Regex - build lowercased string set lazily to avoid unnecessary overhead - keep regex fallback path to preserve substring semantics #2129 --- capa/features/common.py | 13 ++++--------- capa/rules/__init__.py | 36 ++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index f481b00fc8..6eff16fc2d 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -329,16 +329,11 @@ def __init__(self, value: str, description=None): ) from exc # Detect pure-literal case-insensitive patterns: no regex metacharacters, - # just a simple string with the /i flag. For these we can skip the regex - # engine when the lowercased string value is present in the feature set, - # which is a common case for API names, file extensions, and registry keys. + # just a simple string with the /i flag. For these we can skip the regex + # engine when the lowercased string value is present in the feature set. 
# See: https://github.com/mandiant/capa/issues/2129 - if value.endswith("/i") and re.escape(pat) == pat: - self._is_pure_literal_ci: bool = True - self._normalized_lower: str = pat.lower() - else: - self._is_pure_literal_ci = False - self._normalized_lower = "" + self._is_pure_literal_ci: bool = value.endswith("/i") and re.escape(pat) == pat + self._normalized_lower: str = pat.lower() if self._is_pure_literal_ci else "" def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index d090a30c69..69de891415 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -2004,27 +2004,27 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea string_features[feature] = locations if string_features: - # Pre-compute the set of lowercased string values once per scope evaluation. - # This enables an O(1) fast path for pure-literal case-insensitive patterns: - # instead of invoking the regex engine for every candidate string, we check - # whether the lowercased pattern value is already present in the feature set. - # When found, the pattern is guaranteed to match and we skip the regex call. - # When not found, we still fall back to the full regex scan to handle substring - # matches such as /createfile/i matching "CreateFileA". - # See: https://github.com/mandiant/capa/issues/2129 - lowercased_strings: frozenset[str] = frozenset( - feature.value.lower() for feature in string_features if isinstance(feature.value, str) - ) + # Build this lazily, only when we encounter a pure-literal `/i` regex. + # This preserves fast-path wins while avoiding unnecessary overhead in + # workloads where such regexes are uncommon.
+ lowercased_strings: frozenset[str] | None = None for rule_name, wanted_strings in feature_index.string_rules.items(): for wanted_string in wanted_strings: # Fast path: pure-literal /i patterns can be resolved via O(1) lookup. - if ( - isinstance(wanted_string, capa.features.common.Regex) - and wanted_string._is_pure_literal_ci - and wanted_string._normalized_lower in lowercased_strings - ): - candidate_rule_names.add(rule_name) - break + if isinstance(wanted_string, capa.features.common.Regex) and wanted_string._is_pure_literal_ci: + if lowercased_strings is None: + lowercased_strings = frozenset( + feature.value.lower() + for feature in string_features + if isinstance(feature.value, str) + ) + + if wanted_string._normalized_lower in lowercased_strings: + candidate_rule_names.add(rule_name) + break + + # When the fast path is not sufficient, keep the existing + # regex behavior to preserve substring semantics. if wanted_string.evaluate(string_features): candidate_rule_names.add(rule_name) break