diff --git a/CHANGELOG.md b/CHANGELOG.md index db5fe728ea..c561c52900 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,7 @@ - doc: fix typo in usage.md, add documentation links to README @devs6186 #2274 - doc: add table comparing ways to consume capa output (CLI, IDA, Ghidra, dynamic sandbox, web) @devs6186 #2273 - binja: add mypy config for top-level binaryninja module to fix mypy issues @devs6186 #2399 +- rules: skip regex engine for pure-literal case-insensitive patterns via O(1) lowercased-string lookup #2129 - ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777 - ci: pin pip-audit action SHAs and update to v1.1.0 @kami922 #1131 diff --git a/capa/features/common.py b/capa/features/common.py index 5bde5d3599..6eff16fc2d 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -328,6 +328,13 @@ def __init__(self, value: str, description=None): f"invalid regular expression: {value} it should use Python syntax, try it at https://pythex.org" ) from exc + # Detect pure-literal case-insensitive patterns: the /i flag is set and + # re.escape(pat) == pat, i.e. the pattern contains nothing that needs escaping. + # For these, callers can skip the regex engine when a lowercased feature string equals _normalized_lower. 
+ # See: https://github.com/mandiant/capa/issues/2129 + self._is_pure_literal_ci: bool = value.endswith("/i") and re.escape(pat) == pat + self._normalized_lower: str = pat.lower() if self._is_pure_literal_ci else "" + def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.regex"] += 1 diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 23fd0dd3c0..69de891415 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1988,11 +1988,6 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea # We may want to try to pre-evaluate these strings, based on their presence in the file, # to reduce the number of evaluations we do here. # See: https://github.com/mandiant/capa/issues/2126 - # - # We may also want to specialize case-insensitive strings, which would enable them to - # be indexed, and therefore skip the scanning here, improving performance. - # This strategy is described here: - # https://github.com/mandiant/capa/issues/2129 if feature_index.string_rules: # This is a FeatureSet that contains only String features. # Since we'll only be evaluating String/Regex features below, we don't care about @@ -2009,10 +2004,30 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea string_features[feature] = locations if string_features: + # Lowercased string values, built lazily on the first pure-literal `/i` regex. + # Deferring construction means workloads without such regexes never pay + # for lowercasing every string feature. + lowercased_strings: frozenset[str] | None = None for rule_name, wanted_strings in feature_index.string_rules.items(): for wanted_string in wanted_strings: + # Fast path: an exact case-insensitive hit on a pure-literal /i pattern is one set lookup. 
+ if isinstance(wanted_string, capa.features.common.Regex) and wanted_string._is_pure_literal_ci: + if lowercased_strings is None: + lowercased_strings = frozenset( + feature.value.lower() + for feature in string_features + if isinstance(feature.value, str) + ) + + if wanted_string._normalized_lower in lowercased_strings: + candidate_rule_names.add(rule_name) + break + + # No exact hit (or not a pure-literal /i regex): fall back to evaluate() + # so that existing matching, including regex substring matches, still applies. if wanted_string.evaluate(string_features): candidate_rule_names.add(rule_name) + break # Like with String/Regex features above, we have to scan for Bytes to find candidate rules. # diff --git a/tests/test_match.py b/tests/test_match.py index cfeeb7e9fc..5825bdcf6a 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -816,3 +816,56 @@ def test_index_features_nested_unstable(): assert not index.string_rules assert not index.bytes_rules + + +def test_regex_pure_literal_ci_fast_path_detection(): + """Verify that Regex flags only metacharacter-free patterns carrying the /i flag as pure-literal.""" + # Pure literal patterns: no metacharacters, /i flag + r1 = capa.features.common.Regex("/createfile/i") + assert r1._is_pure_literal_ci is True + assert r1._normalized_lower == "createfile" + + r2 = capa.features.common.Regex("/useragent/i") + assert r2._is_pure_literal_ci is True + assert r2._normalized_lower == "useragent" + + # Complex pattern: has metacharacters, should NOT be flagged + r3 = capa.features.common.Regex("/create.*file/i") + assert r3._is_pure_literal_ci is False + + # Case-sensitive pattern: no /i flag + r4 = capa.features.common.Regex("/createfile/") + assert r4._is_pure_literal_ci is False + + +def test_regex_ci_fast_path_correctness(): + """Verify rule matching agrees on exact hits (fast path) and substring hits (regex fallback).""" + rule_text = textwrap.dedent( + """ + rule: + meta: + name: test ci fast path + scopes: + static: function + dynamic: process + features: + - string: /createfile/i + """ 
+ ) + r = capa.rules.Rule.from_yaml(rule_text) + rr = capa.rules.RuleSet([r]) + + # Should match: exact case-insensitive match (fast path) + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("CreateFile"): {0x0}}, 0x0) + assert "test ci fast path" in matches + + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("CREATEFILE"): {0x0}}, 0x0) + assert "test ci fast path" in matches + + # Should match: substring match (regex fallback path) + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("CreateFileA"): {0x0}}, 0x0) + assert "test ci fast path" in matches + + # Should not match: neither an exact hit nor a substring hit + _, matches = rr.match(capa.rules.Scope.FUNCTION, {String("WriteFile"): {0x0}}, 0x0) + assert "test ci fast path" not in matches