From 4b1a5a4d4babd2086bdb89920e4009ab839d434c Mon Sep 17 00:00:00 2001
From: MesTTo
Date: Tue, 23 Jun 2026 18:52:14 +1000
Subject: [PATCH] Add expression trie sidecar

---
 kernel/src/expression_trie.rs   | 309 ++++++++++++++++++++++++++++++++
 kernel/src/lib.rs               |   3 +
 kernel/src/pattern_relations.rs |  72 +++-----
 kernel/src/test_exprs.rs        |  54 ++++++
 4 files changed, 393 insertions(+), 45 deletions(-)
 create mode 100644 kernel/src/expression_trie.rs
 create mode 100644 kernel/src/test_exprs.rs

diff --git a/kernel/src/expression_trie.rs b/kernel/src/expression_trie.rs
new file mode 100644
index 00000000..33ea4967
--- /dev/null
+++ b/kernel/src/expression_trie.rs
@@ -0,0 +1,309 @@
+use std::collections::BTreeMap;
+
+use crate::pattern_relations::{
+    lower_pattern, match_fact_ids, PatternLoweringError, PatternRelationMatchError,
+    PatternRelationMatches,
+};
+use crate::term_identity::{FactId, TermId, TermIdentitySidecar, TermKind};
+
+/// Typed preorder token used by the derived expression trie.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
+pub enum ExpressionTrieToken {
+    /// Application node with encoded arity.
+    App(u8),
+    /// Complete interned symbol item.
+    Symbol(TermId),
+    /// Stored schematic new-variable token.
+    NewVar,
+    /// Stored schematic variable reference token.
+    VarRef(u8),
+}
+
+/// Snapshot-local discrimination-trie style index over canonical term roots.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ExpressionTrieIndex {
+    nodes: Vec<ExpressionTrieNode>,
+    stats: ExpressionTrieStats,
+}
+
+/// Counters for expression-trie build and candidate lookup.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct ExpressionTrieStats {
+    /// Complete facts indexed.
+    pub facts_indexed: usize,
+    /// Trie nodes allocated, including the root.
+    pub trie_nodes: usize,
+    /// Typed preorder tokens inserted across all facts.
+    pub tokens_indexed: usize,
+}
+
+/// Candidate lookup result before exact filtering.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ExpressionTrieCandidates {
+    /// Conservative typed prefix used for trie descent.
+    pub prefix: Box<[ExpressionTrieToken]>,
+    /// Complete facts below the prefix.
+    pub facts: Box<[FactId]>,
+}
+
+/// Match result from expression-trie candidate retrieval plus exact filtering.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ExpressionTrieMatches {
+    /// Prefix-filter candidates.
+    pub candidates: ExpressionTrieCandidates,
+    /// Exact relationalized pattern matches over the candidate facts.
+    pub exact: PatternRelationMatches,
+}
+
+/// Errors from expression-trie construction or matching.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum ExpressionTrieError {
+    /// A term referenced by a fact or pattern is absent from the sidecar.
+    UnknownTerm { term: TermId },
+    /// Pattern lowering failed.
+    Lowering(PatternLoweringError),
+    /// Exact candidate filtering failed.
+    Match(PatternRelationMatchError),
+}
+
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+struct ExpressionTrieNode {
+    children: BTreeMap<ExpressionTrieToken, usize>,
+    facts: Vec<FactId>,
+}
+
+impl ExpressionTrieIndex {
+    /// Builds a derived typed expression trie from complete fact roots.
+    pub fn build(sidecar: &TermIdentitySidecar) -> Result<Self, ExpressionTrieError> {
+        let mut index = Self {
+            nodes: vec![ExpressionTrieNode::default()],
+            stats: ExpressionTrieStats {
+                trie_nodes: 1,
+                ..ExpressionTrieStats::default()
+            },
+        };
+
+        for fact in sidecar.facts() {
+            index.insert_fact(sidecar, fact.id, fact.root)?;
+        }
+
+        Ok(index)
+    }
+
+    /// Build and lookup counters.
+    pub fn stats(&self) -> ExpressionTrieStats {
+        self.stats
+    }
+
+    /// Returns candidate fact IDs for the grounded typed prefix of `pattern`.
+    ///
+    /// Pattern variables stop prefix extraction because they match complete
+    /// subterms of unknown length. Later constants are checked by the exact
+    /// relationalized matcher.
+    pub fn candidates_for_pattern(
+        &self,
+        sidecar: &TermIdentitySidecar,
+        pattern: TermId,
+    ) -> Result<ExpressionTrieCandidates, ExpressionTrieError> {
+        let mut prefix = Vec::new();
+        append_ground_prefix(sidecar, pattern, &mut prefix)?;
+        let facts = self.facts_below_prefix(&prefix);
+
+        Ok(ExpressionTrieCandidates {
+            prefix: prefix.into_boxed_slice(),
+            facts: facts.into_boxed_slice(),
+        })
+    }
+
+    /// Prefix-filtered exact matching for one pattern term.
+    pub fn match_pattern(
+        &self,
+        sidecar: &TermIdentitySidecar,
+        pattern: TermId,
+    ) -> Result<ExpressionTrieMatches, ExpressionTrieError> {
+        let candidates = self.candidates_for_pattern(sidecar, pattern)?;
+        let plan = lower_pattern(sidecar, pattern).map_err(ExpressionTrieError::Lowering)?;
+        let exact = match_fact_ids(sidecar, &plan, candidates.facts.iter().copied())
+            .map_err(ExpressionTrieError::Match)?;
+
+        Ok(ExpressionTrieMatches { candidates, exact })
+    }
+
+    fn insert_fact(
+        &mut self,
+        sidecar: &TermIdentitySidecar,
+        fact: FactId,
+        root: TermId,
+    ) -> Result<(), ExpressionTrieError> {
+        let mut path = Vec::new();
+        append_exact_tokens(sidecar, root, &mut path)?;
+
+        let mut node = 0usize;
+        for token in path {
+            self.stats.tokens_indexed += 1;
+            if let Some(&child) = self.nodes[node].children.get(&token) {
+                node = child;
+                continue;
+            }
+
+            let child = self.nodes.len();
+            self.nodes.push(ExpressionTrieNode::default());
+            self.nodes[node].children.insert(token, child);
+            self.stats.trie_nodes += 1;
+            node = child;
+        }
+
+        self.nodes[node].facts.push(fact);
+        self.stats.facts_indexed += 1;
+        Ok(())
+    }
+
+    fn facts_below_prefix(&self, prefix: &[ExpressionTrieToken]) -> Vec<FactId> {
+        let mut node = 0usize;
+        for token in prefix {
+            let Some(&child) = self.nodes[node].children.get(token) else {
+                return Vec::new();
+            };
+            node = child;
+        }
+
+        let mut facts = Vec::new();
+        self.collect_facts(node, &mut facts);
+        facts.sort_unstable();
+        facts
+    }
+
+    fn collect_facts(&self, node: usize, facts: &mut Vec<FactId>) {
+        facts.extend_from_slice(&self.nodes[node].facts);
+        for &child in self.nodes[node].children.values() {
+            self.collect_facts(child, facts);
+        }
+    }
+}
+
+fn append_exact_tokens(
+    sidecar: &TermIdentitySidecar,
+    term: TermId,
+    out: &mut Vec<ExpressionTrieToken>,
+) -> Result<(), ExpressionTrieError> {
+    let Some(record) = sidecar.get_term(term) else {
+        return Err(ExpressionTrieError::UnknownTerm { term });
+    };
+
+    match record.kind {
+        TermKind::Symbol => out.push(ExpressionTrieToken::Symbol(term)),
+        TermKind::Application { arity } => {
+            out.push(ExpressionTrieToken::App(arity));
+            for &child in record.children() {
+                append_exact_tokens(sidecar, child, out)?;
+            }
+        }
+        TermKind::NewVar => out.push(ExpressionTrieToken::NewVar),
+        TermKind::VarRef(level) => out.push(ExpressionTrieToken::VarRef(level)),
+    }
+
+    Ok(())
+}
+
+fn append_ground_prefix(
+    sidecar: &TermIdentitySidecar,
+    term: TermId,
+    out: &mut Vec<ExpressionTrieToken>,
+) -> Result<bool, ExpressionTrieError> {
+    let Some(record) = sidecar.get_term(term) else {
+        return Err(ExpressionTrieError::UnknownTerm { term });
+    };
+
+    match record.kind {
+        TermKind::Symbol => {
+            out.push(ExpressionTrieToken::Symbol(term));
+            Ok(true)
+        }
+        TermKind::Application { arity } => {
+            out.push(ExpressionTrieToken::App(arity));
+            for &child in record.children() {
+                if !append_ground_prefix(sidecar, child, out)? {
+                    return Ok(false);
+                }
+            }
+            Ok(true)
+        }
+        TermKind::NewVar | TermKind::VarRef(_) => Ok(false),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::BTreeSet;
+
+    use super::*;
+    use crate::space::Space;
+    use crate::test_exprs::{
+        add_repeated_edge_facts, app, repeated_edge_pattern, repeated_edge_product_roots, sym,
+    };
+
+    #[test]
+    fn typed_expression_trie_filters_repeated_variable_pattern_before_exact_match() {
+        let mut space = Space::new();
+        add_repeated_edge_facts(
+            &mut space,
+            br#"
+(node Alice)
+(tag Bob)
+"#,
+        );
+
+        let pattern = repeated_edge_pattern();
+        let mut sidecar = TermIdentitySidecar::new();
+        let pattern_root = sidecar.insert_term(&pattern).unwrap();
+        sidecar.extend_from_pathmap(&space.btm).unwrap();
+        let index = ExpressionTrieIndex::build(&sidecar).unwrap();
+
+        let matches = index.match_pattern(&sidecar, pattern_root).unwrap();
+        let (product_count, product_roots) = repeated_edge_product_roots(&space);
+        let trie_roots = matches
+            .exact
+            .rows
+            .iter()
+            .map(|row| sidecar.get_term(row.root).unwrap().encoded().to_vec())
+            .collect::<BTreeSet<_>>();
+
+        assert_eq!(product_count, 2);
+        assert_eq!(matches.exact.stats.matches, product_count);
+        assert_eq!(trie_roots, product_roots);
+        assert_eq!(matches.candidates.prefix.len(), 2);
+        assert_eq!(matches.candidates.facts.len(), 5);
+        assert!(matches.candidates.facts.len() < sidecar.stats().facts);
+        assert_eq!(
+            matches.exact.stats.facts_scanned,
+            matches.candidates.facts.len()
+        );
+    }
+
+    #[test]
+    fn typed_expression_trie_exact_ground_prefix_returns_one_candidate() {
+        let mut space = Space::new();
+        space
+            .add_all_sexpr(
+                br#"
+(edge Alice Bob)
+(edge Bob Carol)
+(node Alice)
+"#,
+            )
+            .unwrap();
+
+        let pattern = app(&[sym(b"edge"), sym(b"Alice"), sym(b"Bob")]);
+        let mut sidecar = TermIdentitySidecar::new();
+        let pattern_root = sidecar.insert_term(&pattern).unwrap();
+        sidecar.extend_from_pathmap(&space.btm).unwrap();
+        let index = ExpressionTrieIndex::build(&sidecar).unwrap();
+
+        let matches = index.match_pattern(&sidecar, pattern_root).unwrap();
+
+        assert_eq!(matches.candidates.facts.len(), 1);
+        assert_eq!(matches.candidates.prefix.len(), 4);
+        assert_eq!(matches.exact.stats.matches, 1);
+        assert_eq!(matches.exact.stats.facts_scanned, 1);
+    }
+}
diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs
index e020d925..e20589fc 100644
--- a/kernel/src/lib.rs
+++ b/kernel/src/lib.rs
@@ -11,6 +11,9 @@ mod pure;
 pub mod term_identity;
 pub mod binding_env;
 pub mod pattern_relations;
+pub mod expression_trie;
+#[cfg(test)]
+mod test_exprs;
 
 #[doc(hidden)]
 pub use mork_expr as __mork_expr;
diff --git a/kernel/src/pattern_relations.rs b/kernel/src/pattern_relations.rs
index 80daf059..784a206b 100644
--- a/kernel/src/pattern_relations.rs
+++ b/kernel/src/pattern_relations.rs
@@ -1,5 +1,5 @@
 use crate::binding_env::MAX_BINDING_SLOTS;
-use crate::term_identity::{TermId, TermIdentitySidecar, TermKind};
+use crate::term_identity::{FactId, TermId, TermIdentitySidecar, TermKind};
 
 /// Query-planner variable identity produced by relationalized pattern lowering.
 #[repr(transparent)]
@@ -124,6 +124,8 @@ pub struct PatternRelationMatches {
 /// Errors from executing a relationalized sidecar plan.
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub enum PatternRelationMatchError {
+    /// The sidecar is missing a requested complete fact record.
+    UnknownFact { fact: FactId },
     /// The plan refers to a variable outside its variable table.
     UnknownVariable { variable: PlanVariable },
     /// The sidecar is missing a candidate term referenced by a fact or binding.
@@ -159,11 +161,27 @@ pub fn lower_pattern(
 pub fn match_facts(
     sidecar: &TermIdentitySidecar,
     plan: &PatternRelationPlan,
+) -> Result<PatternRelationMatches, PatternRelationMatchError> {
+    match_fact_ids(sidecar, plan, sidecar.facts().iter().map(|fact| fact.id))
+}
+
+/// Executes a lowered pattern against an explicit candidate set of fact IDs.
+///
+/// This is the exact filtering boundary used by derived indexes. Candidate
+/// generation may be approximate, but every emitted row still passes through the
+/// same canonical `TermId` equality checks as [`match_facts`].
+pub fn match_fact_ids(
+    sidecar: &TermIdentitySidecar,
+    plan: &PatternRelationPlan,
+    facts: impl IntoIterator<Item = FactId>,
 ) -> Result<PatternRelationMatches, PatternRelationMatchError> {
     let mut matcher = FactMatcher::new(sidecar, plan)?;
     let mut result = PatternRelationMatches::default();
 
-    for fact in sidecar.facts() {
+    for fact_id in facts {
+        let Some(fact) = sidecar.get_fact(fact_id) else {
+            return Err(PatternRelationMatchError::UnknownFact { fact: fact_id });
+        };
         result.stats.facts_scanned += 1;
         let mut state = MatchState::new(plan.variables.len(), plan.atoms.len());
         if matcher.match_root(&mut state, fact.root, &mut result.stats)? {
@@ -414,31 +432,12 @@ mod tests {
     use super::*;
     use crate::space::Space;
     use crate::term_identity::TermIdentitySidecar;
-    use mork_expr::{item_byte, Tag};
+    use crate::test_exprs::{
+        add_repeated_edge_facts, app, repeated_edge_pattern, repeated_edge_product_roots, sym, var,
+        var_ref,
+    };
     use std::collections::BTreeSet;
 
-    fn sym(bytes: &[u8]) -> Vec<u8> {
-        let mut out = vec![item_byte(Tag::SymbolSize(bytes.len() as u8))];
-        out.extend_from_slice(bytes);
-        out
-    }
-
-    fn app(children: &[Vec<u8>]) -> Vec<u8> {
-        let mut out = vec![item_byte(Tag::Arity(children.len() as u8))];
-        for child in children {
-            out.extend_from_slice(child);
-        }
-        out
-    }
-
-    fn var() -> Vec<u8> {
-        vec![item_byte(Tag::NewVar)]
-    }
-
-    fn var_ref(slot: u8) -> Vec<u8> {
-        vec![item_byte(Tag::VarRef(slot))]
-    }
-
     fn lower_fact_pattern(
         pattern: Vec<u8>,
     ) -> (TermIdentitySidecar, PatternRelationPlan, PlanVariable) {
@@ -521,33 +520,16 @@ mod tests {
     #[test]
     fn sidecar_matcher_preserves_product_query_roots_for_repeated_variable_pattern() {
         let mut space = Space::new();
-        space
-            .add_all_sexpr(
-                br#"
-(edge Alice (f Alice))
-(edge Alice (f Bob))
-(edge Bob (f Bob))
-(edge Carol (g Carol))
-(edge Dave (f Eve))
-"#,
-            )
-            .unwrap();
+        add_repeated_edge_facts(&mut space, b"");
 
-        let pattern = app(&[sym(b"edge"), var(), app(&[sym(b"f"), var_ref(0)])]);
+        let pattern = repeated_edge_pattern();
         let mut sidecar = TermIdentitySidecar::new();
         let pattern_root = sidecar.insert_term(&pattern).unwrap();
         sidecar.extend_from_pathmap(&space.btm).unwrap();
         let plan = lower_pattern(&sidecar, pattern_root).unwrap();
         let sidecar_matches = match_facts(&sidecar, &plan).unwrap();
 
-        let product_pattern = crate::expr!(space, "[2] , [3] edge $ [2] f _1");
-        let mut product_roots = BTreeSet::new();
-        let product_count = Space::query_multi(&space.btm, product_pattern, |_, loc| {
-            let span = unsafe { loc.span().as_ref().unwrap() };
-            product_roots.insert(span.to_vec());
-            true
-        });
-
+        let (product_count, product_roots) = repeated_edge_product_roots(&space);
         let sidecar_roots = sidecar_matches
             .rows
             .iter()
diff --git a/kernel/src/test_exprs.rs b/kernel/src/test_exprs.rs
new file mode 100644
index 00000000..d2461863
--- /dev/null
+++ b/kernel/src/test_exprs.rs
@@ -0,0 +1,54 @@
+use std::collections::BTreeSet;
+
+use crate::space::Space;
+use mork_expr::{item_byte, Tag};
+
+pub fn sym(bytes: &[u8]) -> Vec<u8> {
+    let mut out = vec![item_byte(Tag::SymbolSize(bytes.len() as u8))];
+    out.extend_from_slice(bytes);
+    out
+}
+
+pub fn app(children: &[Vec<u8>]) -> Vec<u8> {
+    let mut out = vec![item_byte(Tag::Arity(children.len() as u8))];
+    for child in children {
+        out.extend_from_slice(child);
+    }
+    out
+}
+
+pub fn var() -> Vec<u8> {
+    vec![item_byte(Tag::NewVar)]
+}
+
+pub fn var_ref(slot: u8) -> Vec<u8> {
+    vec![item_byte(Tag::VarRef(slot))]
+}
+
+pub fn repeated_edge_pattern() -> Vec<u8> {
+    app(&[sym(b"edge"), var(), app(&[sym(b"f"), var_ref(0)])])
+}
+
+pub fn add_repeated_edge_facts(space: &mut Space, extra_facts: &[u8]) {
+    let mut facts = br#"
+(edge Alice (f Alice))
+(edge Alice (f Bob))
+(edge Bob (f Bob))
+(edge Carol (g Carol))
+(edge Dave (f Eve))
+"#
+    .to_vec();
+    facts.extend_from_slice(extra_facts);
+    space.add_all_sexpr(&facts).unwrap();
+}
+
+pub fn repeated_edge_product_roots(space: &Space) -> (usize, BTreeSet<Vec<u8>>) {
+    let product_pattern = crate::expr!(space, "[2] , [3] edge $ [2] f _1");
+    let mut product_roots = BTreeSet::new();
+    let product_count = Space::query_multi(&space.btm, product_pattern, |_, loc| {
+        let span = unsafe { loc.span().as_ref().unwrap() };
+        product_roots.insert(span.to_vec());
+        true
+    });
+    (product_count, product_roots)
+}