Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
- doc: fix typo in usage.md, add documentation links to README @devs6186 #2274
- doc: add table comparing ways to consume capa output (CLI, IDA, Ghidra, dynamic sandbox, web) @devs6186 #2273
- binja: add mypy config for top-level binaryninja module to fix mypy issues @devs6186 #2399
- rules: skip regex engine for pure-literal case-insensitive patterns via O(1) lowercased-string lookup #2129
- ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777
- ci: pin pip-audit action SHAs and update to v1.1.0 @kami922 #1131

Expand Down
7 changes: 7 additions & 0 deletions capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,13 @@ def __init__(self, value: str, description=None):
f"invalid regular expression: {value} it should use Python syntax, try it at https://pythex.org"
) from exc

# Detect pure-literal case-insensitive patterns: no regex metacharacters,
# just a simple string with the /i flag. For these we can skip the regex
# engine when the lowercased string value is present in the feature set.
# See: https://github.com/mandiant/capa/issues/2129
self._is_pure_literal_ci: bool = value.endswith("/i") and re.escape(pat) == pat
self._normalized_lower: str = pat.lower() if self._is_pure_literal_ci else ""

def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.regex"] += 1
Expand Down
25 changes: 20 additions & 5 deletions capa/rules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1988,11 +1988,6 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea
# We may want to try to pre-evaluate these strings, based on their presence in the file,
# to reduce the number of evaluations we do here.
# See: https://github.com/mandiant/capa/issues/2126
#
# We may also want to specialize case-insensitive strings, which would enable them to
# be indexed, and therefore skip the scanning here, improving performance.
# This strategy is described here:
# https://github.com/mandiant/capa/issues/2129
if feature_index.string_rules:
# This is a FeatureSet that contains only String features.
# Since we'll only be evaluating String/Regex features below, we don't care about
Expand All @@ -2009,10 +2004,30 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea
string_features[feature] = locations

if string_features:
# Build this lazily, only when we encounter a pure-literal `/i` regex.
# This keeps the fast-path win while sparing workloads where such
# regexes are uncommon the cost of lowercasing every string.
lowercased_strings: frozenset[str] | None = None
for rule_name, wanted_strings in feature_index.string_rules.items():
for wanted_string in wanted_strings:
# Fast path: pure-literal /i patterns can be resolved via O(1) lookup.
if isinstance(wanted_string, capa.features.common.Regex) and wanted_string._is_pure_literal_ci:
if lowercased_strings is None:
lowercased_strings = frozenset(
feature.value.lower()
for feature in string_features
if isinstance(feature.value, str)
)

if wanted_string._normalized_lower in lowercased_strings:
candidate_rule_names.add(rule_name)
break

# When the fast path is not sufficient, keep the existing
# regex behavior to preserve substring semantics.
if wanted_string.evaluate(string_features):
candidate_rule_names.add(rule_name)
break

# Like with String/Regex features above, we have to scan for Bytes to find candidate rules.
#
Expand Down
53 changes: 53 additions & 0 deletions tests/test_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,3 +816,56 @@ def test_index_features_nested_unstable():

assert not index.string_rules
assert not index.bytes_rules


def test_regex_pure_literal_ci_fast_path_detection():
    """Check which Regex patterns get flagged as pure-literal, case-insensitive."""
    # No metacharacters and a trailing /i flag: these should be flagged, and the
    # lowercased literal should be recorded for lookup.
    flagged = (
        ("/createfile/i", "createfile"),
        ("/useragent/i", "useragent"),
    )
    for pattern, lowered in flagged:
        regex = capa.features.common.Regex(pattern)
        assert regex._is_pure_literal_ci is True
        assert regex._normalized_lower == lowered

    # Either the pattern contains metacharacters, or it lacks the /i flag:
    # neither may be flagged.
    not_flagged = (
        "/create.*file/i",
        "/createfile/",
    )
    for pattern in not_flagged:
        regex = capa.features.common.Regex(pattern)
        assert regex._is_pure_literal_ci is False


def test_regex_ci_fast_path_correctness():
    """Check that matching a pure-literal /i string rule gives the expected results."""
    rule_name = "test ci fast path"
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test ci fast path
                scopes:
                    static: function
                    dynamic: process
            features:
                - string: /createfile/i
        """
    )
    ruleset = capa.rules.RuleSet([capa.rules.Rule.from_yaml(rule_text)])

    # (string feature value, whether the rule is expected to match)
    expectations = (
        # exact case-insensitive matches (fast path)
        ("CreateFile", True),
        ("CREATEFILE", True),
        # substring match (regex fallback path)
        ("CreateFileA", True),
        # unrelated string: no match
        ("WriteFile", False),
    )
    for value, should_match in expectations:
        _, matches = ruleset.match(capa.rules.Scope.FUNCTION, {String(value): {0x0}}, 0x0)
        assert (rule_name in matches) is should_match
Loading