From dd9f4159e536bcf41b558e84abf62cd580be3a83 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 31 May 2026 22:12:03 +0000 Subject: [PATCH 01/21] feat(cubesql): merge view joins on shared cube member into single CubeScan Generalize the push-down-cube-join rewrite so that a join between two CubeScans (typically views) on a dimension that resolves to the same underlying cube member is merged into a single CubeScan, just like the existing __cubeJoinField cube-to-cube join. A view dimension keeps its original cube.dimension path in alias_member, which is used to detect that both sides of the equi-join reference the same shared key. Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 103 +++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 1429f99e5a7f8..95d9e7020be54 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -2803,15 +2803,114 @@ impl MemberRules { let left_join_hints_var = var!(left_join_hints_var); let right_join_hints_var = var!(right_join_hints_var); let out_join_hints_var = var!(out_join_hints_var); + let meta_context = self.meta_context.clone(); move |egraph, subst| { - let Some((left_cube, right_cube)) = is_proper_cube_join_condition( + // Resolves a join column to the name of the dimension member it + // references, but only for plain or time dimensions (measures, + // segments, etc. are not valid shared join keys). + fn dimension_member_name( + egraph: &mut CubeEGraph, + members_id: Id, + column: &Column, + ) -> Option<String> { + match egraph[members_id].data.find_member_by_column(column) { + Some(((_, Member::Dimension { name, .. }, _), _)) + | Some(((_, Member::TimeDimension { name, .. 
}, _), _)) => Some(name.clone()), + _ => None, + } + } + + // Two ways to recognize a joinable pair of CubeScans: + // 1. The classic `left.__cubeJoinField = right.__cubeJoinField` + // condition that comes from the data model join graph. + // 2. A join between two CubeScans (typically views) on a + // dimension that resolves to the *same underlying cube member* + // — e.g. `orders_view.city = customers_view.city` where both + // `city` dimensions are aliases of the same `cube.dimension`. + // Such a join is on the same shared key, so the two scans can + // be merged into a single CubeScan exactly like any other + // cube-to-cube join, letting the query planner treat the result + // as a (multi-fact) query over the combined members. + let cubes = is_proper_cube_join_condition( egraph, subst, left_members_var, left_on_var, right_members_var, right_on_var, - ) else { + ); + let cubes = match cubes { + Some(cubes) => Some(cubes), + None => { + // A view dimension keeps the original `cube.dimension` path + // in `alias_member`; for non-view members we fall back to + // the member name itself. + let resolve_underlying = |member_name: &str| -> String { + meta_context + .find_dimension_with_name(member_name) + .and_then(|dim| dim.alias_member.clone()) + .unwrap_or_else(|| member_name.to_string()) + }; + + let left_join_ons = var_iter!(egraph[subst[left_on_var]], JoinLeftOn) + .cloned() + .collect::<Vec<_>>(); + let right_join_ons = var_iter!(egraph[subst[right_on_var]], JoinRightOn) + .cloned() + .collect::<Vec<_>>(); + + let mut found = None; + 'pairs: for left_on in left_join_ons.iter() { + for right_on in right_join_ons.iter() { + // Equi-join on a matching set of columns; every + // column pair must resolve to the same underlying + // cube member. 
+ if left_on.is_empty() || left_on.len() != right_on.len() { + continue; + } + let mut left_cube_name: Option<String> = None; + let mut right_cube_name: Option<String> = None; + let mut all_match = true; + for (left_column, right_column) in left_on.iter().zip(right_on.iter()) { + let Some(left_name) = dimension_member_name( + egraph, + subst[left_members_var], + left_column, + ) else { + all_match = false; + break; + }; + let Some(right_name) = dimension_member_name( + egraph, + subst[right_members_var], + right_column, + ) else { + all_match = false; + break; + }; + if resolve_underlying(&left_name) != resolve_underlying(&right_name) + { + all_match = false; + break; + } + left_cube_name = left_name.split('.').next().map(|s| s.to_string()); + right_cube_name = + right_name.split('.').next().map(|s| s.to_string()); + } + if all_match { + if let (Some(left_cube_name), Some(right_cube_name)) = + (left_cube_name, right_cube_name) + { + found = Some((left_cube_name, right_cube_name)); + break 'pairs; + } + } + } + } + found + } + }; + let Some((left_cube, right_cube)) = cubes else { return false; }; From f60313ed99d1664c61d3e0e4646ae4e3ae1ace62 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 31 May 2026 22:18:57 +0000 Subject: [PATCH 02/21] test(cubesql): cover view join merge on shared cube member Co-authored-by: Pavel Tiunov --- rust/cubesql/cubesql/src/compile/test/mod.rs | 1 + .../src/compile/test/test_cube_join_views.rs | 117 ++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs diff --git a/rust/cubesql/cubesql/src/compile/test/mod.rs b/rust/cubesql/cubesql/src/compile/test/mod.rs index 20e63e584b5f8..7dcbc2fa862ec 100644 --- a/rust/cubesql/cubesql/src/compile/test/mod.rs +++ b/rust/cubesql/cubesql/src/compile/test/mod.rs @@ -32,6 +32,7 @@ pub mod test_bi_workarounds; pub mod test_cube_join; #[cfg(test)] pub mod test_cube_join_grouped; +pub mod test_cube_join_views; #[cfg(test)] pub mod 
test_cube_scan; #[cfg(test)] diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs new file mode 100644 index 0000000000000..c10b6a9faa666 --- /dev/null +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -0,0 +1,117 @@ +use cubeclient::models::V1LoadRequestQuery; +use pretty_assertions::assert_eq; + +use crate::{ + compile::{ + rewrite::rewriter::Rewriter, + test::{ + convert_select_to_query_plan_with_meta, init_testing_logger, + utils::LogicalPlanTestUtils, + }, + }, + transport::{CubeMeta, CubeMetaDimension, CubeMetaMeasure}, +}; +use cubeclient::models::V1CubeMetaType; + +/// Two views that both expose the same underlying `Customers.city` +/// dimension (via `aliasMember`). `OrdersView` carries an `Orders` +/// measure while `CustomersView` carries a `Customers` measure, so a +/// query touching both is a multi-fact query joined on the shared key. +fn views_meta() -> Vec<CubeMeta> { + let dimension = |name: &str, alias: &str| CubeMetaDimension { + name: name.to_string(), + r#type: "string".to_string(), + alias_member: Some(alias.to_string()), + ..CubeMetaDimension::default() + }; + let measure = |name: &str, alias: &str| CubeMetaMeasure { + name: name.to_string(), + title: None, + short_title: None, + description: None, + r#type: "number".to_string(), + agg_type: Some("sum".to_string()), + meta: None, + alias_member: Some(alias.to_string()), + format: None, + format_description: None, + currency: None, + }; + + vec![ + CubeMeta { + name: "OrdersView".to_string(), + description: None, + title: None, + r#type: V1CubeMetaType::View, + dimensions: vec![dimension("OrdersView.city", "Customers.city")], + measures: vec![measure("OrdersView.revenue", "Orders.revenue")], + segments: vec![], + joins: None, + folders: None, + nested_folders: None, + hierarchies: None, + meta: None, + }, + CubeMeta { + name: "CustomersView".to_string(), + description: None, + title: None, + 
r#type: V1CubeMetaType::View, + dimensions: vec![dimension("CustomersView.city", "Customers.city")], + measures: vec![measure("CustomersView.amount", "Customers.amount")], + segments: vec![], + joins: None, + folders: None, + nested_folders: None, + hierarchies: None, + meta: None, + }, + ] +} + +/// A join between two views on a dimension that resolves to the same +/// underlying cube member (`Customers.city`) should be merged into a +/// single CubeScan over the combined members, exactly like a regular +/// cube-to-cube join. +#[tokio::test] +async fn test_join_two_views_on_shared_member() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = convert_select_to_query_plan_with_meta( + r#" + SELECT * + FROM OrdersView + LEFT JOIN CustomersView ON (OrdersView.city = CustomersView.city) + "# + .to_string(), + views_meta(), + ) + .await + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "OrdersView.revenue".to_string(), + "CustomersView.amount".to_string(), + ]), + dimensions: Some(vec![ + "OrdersView.city".to_string(), + "CustomersView.city".to_string(), + ]), + segments: Some(vec![]), + order: Some(vec![]), + ungrouped: Some(true), + join_hints: Some(vec![vec![ + "OrdersView".to_string(), + "CustomersView".to_string(), + ]]), + ..Default::default() + } + ) +} From 76963f1835d2ab229cdf6ead6a949473a601d015 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 31 May 2026 22:40:15 +0000 Subject: [PATCH 03/21] test(cubesql): add group-by view join query for shared cube member Mirror the motivating query exactly: SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) FROM customers_view c LEFT JOIN orders_view o ON o.customer_city = c.customer_city GROUP BY 1 and assert it merges into a single grouped multi-fact CubeScan. 
Co-authored-by: Pavel Tiunov --- .../src/compile/test/test_cube_join_views.rs | 96 +++++++++++++++---- 1 file changed, 75 insertions(+), 21 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index c10b6a9faa666..3965d02095016 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -13,10 +13,10 @@ use crate::{ }; use cubeclient::models::V1CubeMetaType; -/// Two views that both expose the same underlying `Customers.city` -/// dimension (via `aliasMember`). `OrdersView` carries an `Orders` -/// measure while `CustomersView` carries a `Customers` measure, so a -/// query touching both is a multi-fact query joined on the shared key. +/// Two views that both expose the same underlying `customers.customer_city` +/// dimension (via `aliasMember`). `orders_view` carries an `orders` measure +/// while `customers_view` carries a `customers` measure, so a query that +/// touches both is a multi-fact query joined on the shared key. 
fn views_meta() -> Vec<CubeMeta> { let dimension = |name: &str, alias: &str| CubeMetaDimension { name: name.to_string(), @@ -24,13 +24,13 @@ fn views_meta() -> Vec<CubeMeta> { alias_member: Some(alias.to_string()), ..CubeMetaDimension::default() }; - let measure = |name: &str, alias: &str| CubeMetaMeasure { + let measure = |name: &str, alias: &str, agg: &str| CubeMetaMeasure { name: name.to_string(), title: None, short_title: None, description: None, r#type: "number".to_string(), - agg_type: Some("sum".to_string()), + agg_type: Some(agg.to_string()), meta: None, alias_member: Some(alias.to_string()), format: None, @@ -40,12 +40,19 @@ fn views_meta() -> Vec<CubeMeta> { vec![ CubeMeta { - name: "OrdersView".to_string(), + name: "customers_view".to_string(), description: None, title: None, r#type: V1CubeMetaType::View, - dimensions: vec![dimension("OrdersView.city", "Customers.city")], - measures: vec![measure("OrdersView.revenue", "Orders.revenue")], + dimensions: vec![dimension( + "customers_view.customer_city", + "customers.customer_city", + )], + measures: vec![measure( + "customers_view.avg_age", + "customers.avg_age", + "avg", + )], segments: vec![], joins: None, folders: None, @@ -54,12 +61,15 @@ fn views_meta() -> Vec<CubeMeta> { meta: None, }, CubeMeta { - name: "CustomersView".to_string(), + name: "orders_view".to_string(), description: None, title: None, r#type: V1CubeMetaType::View, - dimensions: vec![dimension("CustomersView.city", "Customers.city")], - measures: vec![measure("CustomersView.amount", "Customers.amount")], + dimensions: vec![dimension( + "orders_view.customer_city", + "customers.customer_city", + )], + measures: vec![measure("orders_view.revenue", "orders.revenue", "sum")], segments: vec![], joins: None, folders: None, @@ -71,7 +81,7 @@ fn views_meta() -> Vec<CubeMeta> { } /// A join between two views on a dimension that resolves to the same -/// underlying cube member (`Customers.city`) should be merged into a +/// underlying cube member (`customers.customer_city`) should be merged into a 
/// single CubeScan over the combined members, exactly like a regular /// cube-to-cube join. #[tokio::test] @@ -84,8 +94,9 @@ async fn test_join_two_views_on_shared_member() { let logical_plan = convert_select_to_query_plan_with_meta( r#" SELECT * - FROM OrdersView - LEFT JOIN CustomersView ON (OrdersView.city = CustomersView.city) + FROM customers_view + LEFT JOIN orders_view + ON (orders_view.customer_city = customers_view.customer_city) "# .to_string(), views_meta(), @@ -97,19 +108,62 @@ async fn test_join_two_views_on_shared_member() { logical_plan.find_cube_scan().request, V1LoadRequestQuery { measures: Some(vec![ - "OrdersView.revenue".to_string(), - "CustomersView.amount".to_string(), + "customers_view.avg_age".to_string(), + "orders_view.revenue".to_string(), ]), dimensions: Some(vec![ - "OrdersView.city".to_string(), - "CustomersView.city".to_string(), + "customers_view.customer_city".to_string(), + "orders_view.customer_city".to_string(), ]), segments: Some(vec![]), order: Some(vec![]), ungrouped: Some(true), join_hints: Some(vec![vec![ - "OrdersView".to_string(), - "CustomersView".to_string(), + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} + +/// The motivating query: a grouped (multi-fact) query selecting a dimension +/// and measures from each view, joined on the shared `customer_city`. The two +/// view scans are merged into a single grouped CubeScan over the combined +/// members. 
+#[tokio::test] +async fn test_group_by_join_two_views_on_shared_member() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = convert_select_to_query_plan_with_meta( + r#" + SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) + FROM customers_view c + LEFT JOIN orders_view o ON o.customer_city = c.customer_city + GROUP BY 1 + "# + .to_string(), + views_meta(), + ) + .await + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "orders_view.revenue".to_string(), + "customers_view.avg_age".to_string(), + ]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), ]]), ..Default::default() } From e5f4bf14ff5d31cdd08d4f4f1e968aa78c350a84 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 31 May 2026 23:22:15 +0000 Subject: [PATCH 04/21] feat(cubesql): respect inner/left/right join semantics for view joins When merging a join between two views on a shared cube member, the downstream multi-fact query is rendered as a FULL OUTER JOIN over the shared key. To recover the requested join semantics, the rewrite now adds a measure 'set' filter on each side that must be present: - INNER: both sides required - LEFT: left side required - RIGHT: right side required - FULL: no extra filter Branch presence is detected via a measure of the side (the grouping key is COALESCEd across sides downstream, so it cannot tell sides apart). Covered with left/inner group-by tests. 
Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 88 ++++++++++++++++++- .../src/compile/test/test_cube_join_views.rs | 74 ++++++++++++++-- 2 files changed, 152 insertions(+), 10 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 95d9e7020be54..53766e933a9a0 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -432,6 +432,9 @@ impl RewriteRules for MemberRules { "?left_join_hints", "?right_join_hints", "?out_join_hints", + "?join_type", + "?left_filters", + "?right_filters", ), ), ]; @@ -2792,6 +2795,9 @@ impl MemberRules { left_join_hints_var: &'static str, right_join_hints_var: &'static str, out_join_hints_var: &'static str, + join_type_var: &'static str, + left_filters_var: &'static str, + right_filters_var: &'static str, ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { let left_alias_to_cube_var = var!(left_alias_to_cube_var); let right_alias_to_cube_var = var!(right_alias_to_cube_var); @@ -2803,6 +2809,9 @@ impl MemberRules { let left_join_hints_var = var!(left_join_hints_var); let right_join_hints_var = var!(right_join_hints_var); let out_join_hints_var = var!(out_join_hints_var); + let join_type_var = var!(join_type_var); + let left_filters_var = var!(left_filters_var); + let right_filters_var = var!(right_filters_var); let meta_context = self.meta_context.clone(); move |egraph, subst| { // Resolves a join column to the name of the dimension member it @@ -2839,8 +2848,8 @@ impl MemberRules { right_members_var, right_on_var, ); - let cubes = match cubes { - Some(cubes) => Some(cubes), + let (cubes, shared_member_join) = match cubes { + Some(cubes) => (Some(cubes), false), None => { // A view dimension keeps the original `cube.dimension` path // in `alias_member`; for non-view members we fall back to @@ -2907,13 +2916,86 @@ impl MemberRules { } } } - 
found + (found, true) } }; let Some((left_cube, right_cube)) = cubes else { return false; }; + // For a join between two views on a shared cube member, Tesseract + // renders the merged multi-fact scan as a FULL OUTER JOIN over the + // shared key. Re-introduce the requested INNER/LEFT/RIGHT semantics + // by requiring a measure of each "must be present" side to be set + // (FULL adds nothing). Branch presence is detected via a measure + // because the shared grouping key is COALESCEd across sides + // downstream and so cannot distinguish which side a row came from. + if shared_member_join { + fn side_measure(egraph: &CubeEGraph, members_id: Id) -> Option<String> { + egraph[members_id] + .data + .member_name_to_expr + .as_ref() + .and_then(|m| { + m.list.iter().find_map(|(_, member, _)| match member { + Member::Measure { name, .. } => Some(name.clone()), + _ => None, + }) + }) + } + + let mut require_left = false; + let mut require_right = false; + if let Some(join_type) = var_list_iter!(egraph[subst[join_type_var]], JoinJoinType) + .cloned() + .next() + { + match join_type.0 { + datafusion::prelude::JoinType::Inner => { + require_left = true; + require_right = true; + } + datafusion::prelude::JoinType::Left => require_left = true, + datafusion::prelude::JoinType::Right => require_right = true, + _ => {} + } + } + + let mut presence_members = vec![]; + if require_left { + if let Some(name) = side_measure(egraph, subst[left_members_var]) { + presence_members.push(name); + } + } + if require_right { + if let Some(name) = side_measure(egraph, subst[right_members_var]) { + presence_members.push(name); + } + } + + if !presence_members.is_empty() { + let mut acc = subst[left_filters_var]; + for name in presence_members { + let member = egraph.add(LogicalPlanLanguage::FilterMemberMember( + crate::compile::rewrite::FilterMemberMember(name), + )); + let op = egraph.add(LogicalPlanLanguage::FilterMemberOp( + crate::compile::rewrite::FilterMemberOp("set".to_string()), + )); + let 
values = egraph.add(LogicalPlanLanguage::FilterMemberValues( + crate::compile::rewrite::FilterMemberValues(vec![]), + )); + let filter_member = + egraph.add(LogicalPlanLanguage::FilterMember([member, op, values])); + acc = egraph.add(LogicalPlanLanguage::CubeScanFilters(vec![ + filter_member, + acc, + ])); + } + subst.insert(left_filters_var, acc); + } + } + for left_alias_to_cube in var_iter!(egraph[subst[left_alias_to_cube_var]], CubeScanAliasToCube) { diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 3965d02095016..c04c9d0cda35e 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -1,4 +1,4 @@ -use cubeclient::models::V1LoadRequestQuery; +use cubeclient::models::{V1LoadRequestQuery, V1LoadRequestQueryFilterItem}; use pretty_assertions::assert_eq; use crate::{ @@ -80,10 +80,22 @@ fn views_meta() -> Vec<CubeMeta> { ] } +fn set_filter(member: &str) -> V1LoadRequestQueryFilterItem { + V1LoadRequestQueryFilterItem { + member: Some(member.to_string()), + operator: Some("set".to_string()), + values: None, + or: None, + and: None, + } +} + /// A join between two views on a dimension that resolves to the same /// underlying cube member (`customers.customer_city`) should be merged into a /// single CubeScan over the combined members, exactly like a regular -/// cube-to-cube join. +/// cube-to-cube join. As a `LEFT JOIN`, the left ("must be present") side is +/// guarded with a `set` filter so the downstream FULL OUTER multi-fact stitch +/// keeps left-join semantics. 
#[tokio::test] async fn test_join_two_views_on_shared_member() { if !Rewriter::sql_push_down_enabled() { @@ -117,6 +129,7 @@ async fn test_join_two_views_on_shared_member() { ]), segments: Some(vec![]), order: Some(vec![]), + filters: Some(vec![set_filter("customers_view.avg_age")]), ungrouped: Some(true), join_hints: Some(vec![vec![ "customers_view".to_string(), @@ -127,12 +140,13 @@ async fn test_join_two_views_on_shared_member() { ) } -/// The motivating query: a grouped (multi-fact) query selecting a dimension -/// and measures from each view, joined on the shared `customer_city`. The two -/// view scans are merged into a single grouped CubeScan over the combined -/// members. +/// The motivating query: a grouped (multi-fact) `LEFT JOIN` selecting a +/// dimension and measures from each view, joined on the shared `customer_city`. +/// The two view scans are merged into a single grouped CubeScan, and the left +/// side gets a `set` filter to recover LEFT-join semantics on top of the +/// FULL OUTER multi-fact stitch. #[tokio::test] -async fn test_group_by_join_two_views_on_shared_member() { +async fn test_group_by_left_join_two_views_on_shared_member() { if !Rewriter::sql_push_down_enabled() { return; } @@ -161,6 +175,52 @@ async fn test_group_by_join_two_views_on_shared_member() { dimensions: Some(vec!["customers_view.customer_city".to_string()]), segments: Some(vec![]), order: Some(vec![]), + filters: Some(vec![set_filter("customers_view.avg_age")]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} + +/// Same shape but `INNER JOIN`: both sides must be present, so the merged scan +/// carries a `set` filter for a measure of each side. 
+#[tokio::test] +async fn test_group_by_inner_join_two_views_on_shared_member() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = convert_select_to_query_plan_with_meta( + r#" + SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) + FROM customers_view c + INNER JOIN orders_view o ON o.customer_city = c.customer_city + GROUP BY 1 + "# + .to_string(), + views_meta(), + ) + .await + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "orders_view.revenue".to_string(), + "customers_view.avg_age".to_string(), + ]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + filters: Some(vec![ + set_filter("orders_view.revenue"), + set_filter("customers_view.avg_age"), + ]), join_hints: Some(vec![vec![ "customers_view".to_string(), "orders_view".to_string(), From a18644fb8366381cdbefc7af9e5bd217f4f4af48 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 31 May 2026 23:34:25 +0000 Subject: [PATCH 05/21] refactor(cubesql): use join key (not a measure) for view-join presence filter Detect side presence with the side's join-key dimension instead of an arbitrary measure. The join key is always available and is the actual shared-key marker, avoiding the nullable-measure caveat and the case where a side has no selected measure. 
- LEFT: left join key must be set - RIGHT: right join key must be set - INNER: both join keys must be set - FULL: no extra filter Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 37 +++++++------------ .../src/compile/test/test_cube_join_views.rs | 8 ++-- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 53766e933a9a0..3c57344b1092d 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -2848,6 +2848,8 @@ impl MemberRules { right_members_var, right_on_var, ); + let mut shared_left_keys: Vec<String> = vec![]; + let mut shared_right_keys: Vec<String> = vec![]; let (cubes, shared_member_join) = match cubes { Some(cubes) => (Some(cubes), false), None => { @@ -2879,6 +2881,8 @@ impl MemberRules { } let mut left_cube_name: Option<String> = None; let mut right_cube_name: Option<String> = None; + let mut left_keys: Vec<String> = vec![]; + let mut right_keys: Vec<String> = vec![]; let mut all_match = true; for (left_column, right_column) in left_on.iter().zip(right_on.iter()) { let Some(left_name) = dimension_member_name( @@ -2905,12 +2909,16 @@ impl MemberRules { left_cube_name = left_name.split('.').next().map(|s| s.to_string()); right_cube_name = right_name.split('.').next().map(|s| s.to_string()); + left_keys.push(left_name); + right_keys.push(right_name); } if all_match { if let (Some(left_cube_name), Some(right_cube_name)) = (left_cube_name, right_cube_name) { found = Some((left_cube_name, right_cube_name)); + shared_left_keys = left_keys; + shared_right_keys = right_keys; break 'pairs; } } @@ -2926,24 +2934,9 @@ impl MemberRules { // For a join between two views on a shared cube member, Tesseract // renders the merged multi-fact scan as a FULL OUTER JOIN over the // shared key. 
Re-introduce the requested INNER/LEFT/RIGHT semantics - // by requiring a measure of each "must be present" side to be set - // (FULL adds nothing). Branch presence is detected via a measure - // because the shared grouping key is COALESCEd across sides - // downstream and so cannot distinguish which side a row came from. + // by requiring the join key of each "must be present" side to be + // set (FULL adds nothing). if shared_member_join { - fn side_measure(egraph: &CubeEGraph, members_id: Id) -> Option<String> { - egraph[members_id] - .data - .member_name_to_expr - .as_ref() - .and_then(|m| { - m.list.iter().find_map(|(_, member, _)| match member { - Member::Measure { name, .. } => Some(name.clone()), - _ => None, - }) - }) - } - let mut require_left = false; let mut require_right = false; if let Some(join_type) = var_list_iter!(egraph[subst[join_type_var]], JoinJoinType) @@ -2961,16 +2954,12 @@ impl MemberRules { } } - let mut presence_members = vec![]; + let mut presence_members: Vec<String> = vec![]; if require_left { - if let Some(name) = side_measure(egraph, subst[left_members_var]) { - presence_members.push(name); - } + presence_members.extend(shared_left_keys.iter().cloned()); } if require_right { - if let Some(name) = side_measure(egraph, subst[right_members_var]) { - presence_members.push(name); - } + presence_members.extend(shared_right_keys.iter().cloned()); } if !presence_members.is_empty() { diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index c04c9d0cda35e..2ab13089d7f2b 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -129,7 +129,7 @@ async fn test_join_two_views_on_shared_member() { ]), segments: Some(vec![]), order: Some(vec![]), - filters: Some(vec![set_filter("customers_view.avg_age")]), + filters: Some(vec![set_filter("customers_view.customer_city")]), ungrouped: 
Some(true), join_hints: Some(vec![vec![ "customers_view".to_string(), @@ -175,7 +175,7 @@ async fn test_group_by_left_join_two_views_on_shared_member() { dimensions: Some(vec!["customers_view.customer_city".to_string()]), segments: Some(vec![]), order: Some(vec![]), - filters: Some(vec![set_filter("customers_view.avg_age")]), + filters: Some(vec![set_filter("customers_view.customer_city")]), join_hints: Some(vec![vec![ "customers_view".to_string(), "orders_view".to_string(), @@ -218,8 +218,8 @@ async fn test_group_by_inner_join_two_views_on_shared_member() { segments: Some(vec![]), order: Some(vec![]), filters: Some(vec![ - set_filter("orders_view.revenue"), - set_filter("customers_view.avg_age"), + set_filter("orders_view.customer_city"), + set_filter("customers_view.customer_city"), ]), join_hints: Some(vec![vec![ "customers_view".to_string(), From 072a43f9641fe2aa339eb9fba8736cb08432c542 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 31 May 2026 23:45:24 +0000 Subject: [PATCH 06/21] feat(cubesql): only merge view joins when the join key is fully within dimensions Make the merge gate explicit: the entire join key must resolve to dimensions (or time dimensions) on both sides and to the same underlying cube member. A join key that touches a measure/segment/etc. is rejected and the join falls back to normal (non-merged) handling. Add a negative test that joining two views on measures is not merged. 
Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 10 +++-- .../src/compile/test/test_cube_join_views.rs | 44 +++++++++++++++---- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 3c57344b1092d..65798da62d9b0 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -2873,9 +2873,13 @@ impl MemberRules { let mut found = None; 'pairs: for left_on in left_join_ons.iter() { for right_on in right_join_ons.iter() { - // Equi-join on a matching set of columns; every - // column pair must resolve to the same underlying - // cube member. + // We can only merge when the *whole* join key is + // fully within dimensions: every column pair must + // resolve to a dimension (or time dimension) on both + // sides and to the same underlying cube member. A + // join key that touches a measure/segment/etc. (or + // mixes underlying members) is rejected, leaving the + // join to the normal (non-merged) handling. 
if left_on.is_empty() || left_on.len() != right_on.len() { continue; } diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 2ab13089d7f2b..a1e1c285e0d80 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -1,17 +1,17 @@ -use cubeclient::models::{V1LoadRequestQuery, V1LoadRequestQueryFilterItem}; +use cubeclient::models::{V1CubeMetaType, V1LoadRequestQuery, V1LoadRequestQueryFilterItem}; use pretty_assertions::assert_eq; use crate::{ compile::{ rewrite::rewriter::Rewriter, test::{ - convert_select_to_query_plan_with_meta, init_testing_logger, - utils::LogicalPlanTestUtils, + convert_select_to_query_plan_with_meta, convert_sql_to_cube_query, get_test_session, + get_test_tenant_ctx_with_meta, init_testing_logger, utils::LogicalPlanTestUtils, }, + CompilationError, DatabaseProtocol, }, transport::{CubeMeta, CubeMetaDimension, CubeMetaMeasure}, }; -use cubeclient::models::V1CubeMetaType; /// Two views that both expose the same underlying `customers.customer_city` /// dimension (via `aliasMember`). `orders_view` carries an `orders` measure @@ -94,8 +94,8 @@ fn set_filter(member: &str) -> V1LoadRequestQueryFilterItem { /// underlying cube member (`customers.customer_city`) should be merged into a /// single CubeScan over the combined members, exactly like a regular /// cube-to-cube join. As a `LEFT JOIN`, the left ("must be present") side is -/// guarded with a `set` filter so the downstream FULL OUTER multi-fact stitch -/// keeps left-join semantics. +/// guarded with a `set` filter on its join key so the downstream FULL OUTER +/// multi-fact stitch keeps left-join semantics. 
#[tokio::test] async fn test_join_two_views_on_shared_member() { if !Rewriter::sql_push_down_enabled() { @@ -143,7 +143,7 @@ async fn test_join_two_views_on_shared_member() { /// The motivating query: a grouped (multi-fact) `LEFT JOIN` selecting a /// dimension and measures from each view, joined on the shared `customer_city`. /// The two view scans are merged into a single grouped CubeScan, and the left -/// side gets a `set` filter to recover LEFT-join semantics on top of the +/// join key gets a `set` filter to recover LEFT-join semantics on top of the /// FULL OUTER multi-fact stitch. #[tokio::test] async fn test_group_by_left_join_two_views_on_shared_member() { @@ -186,7 +186,7 @@ async fn test_group_by_left_join_two_views_on_shared_member() { } /// Same shape but `INNER JOIN`: both sides must be present, so the merged scan -/// carries a `set` filter for a measure of each side. +/// carries a `set` filter on the join key of each side. #[tokio::test] async fn test_group_by_inner_join_two_views_on_shared_member() { if !Rewriter::sql_push_down_enabled() { @@ -229,3 +229,31 @@ async fn test_group_by_inner_join_two_views_on_shared_member() { } ) } + +/// The merge only fires when the join key is fully within dimensions. Joining +/// the two views on a measure (`o.revenue = c.avg_age`) is not a shared-member +/// dimension join, so the scans are not merged and the query is rejected the +/// same way any other unsupported cube join is. 
+#[tokio::test] +async fn test_join_two_views_on_measure_is_not_merged() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let meta = get_test_tenant_ctx_with_meta(views_meta()); + let query = convert_sql_to_cube_query( + &r#" + SELECT * + FROM customers_view c + LEFT JOIN orders_view o ON (o.revenue = c.avg_age) + "# + .to_string(), + meta.clone(), + get_test_session(DatabaseProtocol::PostgreSQL, meta).await, + ) + .await; + + let error = query.unwrap_err(); + assert!(matches!(error, CompilationError::Rewrite(..))); +} From b95e8d750acb7da785049724df32089a810957bf Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 1 Jun 2026 20:26:15 +0000 Subject: [PATCH 07/21] fix(cubesql): gate view-join test module with cfg(test); drop unused var - Add #[cfg(test)] to the test_cube_join_views module so it is not compiled into non-test builds (fixes unresolved pretty_assertions and unused-import errors under clippy -D warnings and the native builds). - Remove the unused right_filters_var from push_down_cube_join. 
Co-authored-by: Pavel Tiunov --- rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs | 3 --- rust/cubesql/cubesql/src/compile/test/mod.rs | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 65798da62d9b0..9b317a5bb38aa 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -434,7 +434,6 @@ impl RewriteRules for MemberRules { "?out_join_hints", "?join_type", "?left_filters", - "?right_filters", ), ), ]; @@ -2797,7 +2796,6 @@ impl MemberRules { out_join_hints_var: &'static str, join_type_var: &'static str, left_filters_var: &'static str, - right_filters_var: &'static str, ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { let left_alias_to_cube_var = var!(left_alias_to_cube_var); let right_alias_to_cube_var = var!(right_alias_to_cube_var); @@ -2811,7 +2809,6 @@ impl MemberRules { let out_join_hints_var = var!(out_join_hints_var); let join_type_var = var!(join_type_var); let left_filters_var = var!(left_filters_var); - let right_filters_var = var!(right_filters_var); let meta_context = self.meta_context.clone(); move |egraph, subst| { // Resolves a join column to the name of the dimension member it diff --git a/rust/cubesql/cubesql/src/compile/test/mod.rs b/rust/cubesql/cubesql/src/compile/test/mod.rs index 7dcbc2fa862ec..cfefbc79da11d 100644 --- a/rust/cubesql/cubesql/src/compile/test/mod.rs +++ b/rust/cubesql/cubesql/src/compile/test/mod.rs @@ -32,6 +32,7 @@ pub mod test_bi_workarounds; pub mod test_cube_join; #[cfg(test)] pub mod test_cube_join_grouped; +#[cfg(test)] pub mod test_cube_join_views; #[cfg(test)] pub mod test_cube_scan; From 9a682af19f4f4aa4b115c298992cdd40b1047aa5 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 1 Jun 2026 21:40:34 +0000 Subject: [PATCH 08/21] chore: re-trigger CI (flaky Windows native + 
concurrency-canceled redshift) Co-authored-by: Pavel Tiunov From 6306a2fc09fd4217241333ca6629f45709fc2d99 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 6 Jun 2026 06:28:02 +0000 Subject: [PATCH 09/21] feat(cubesql): only merge view joins under aggregate grouping by the join key Move the shared-member view-join merge out of push_down_cube_join (which runs on the always-ungrouped raw join) into a new rule that matches an Aggregate over the join. The merge now only fires when: - the query is grouped (an Aggregate sits over the join), and - the GROUP BY is exactly the shared join key. This rejects ungrouped queries (e.g. SELECT * over the join) and queries that group by a non-join-key dimension, both of which would otherwise produce an incorrect multi-fact pushdown. push_down_cube_join is restored to its original __cubeJoinField-only behavior. Tests: grouped left/inner merge (with join-key set filters); ungrouped, group-by-mismatch, and measure-key joins are not merged. Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 469 ++++++++++++------ .../src/compile/test/test_cube_join_views.rs | 130 ++--- 2 files changed, 389 insertions(+), 210 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 9b317a5bb38aa..c0aaffec860cd 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -432,8 +432,87 @@ impl RewriteRules for MemberRules { "?left_join_hints", "?right_join_hints", "?out_join_hints", + ), + ), + // Merge a join between two (view) CubeScans on a dimension that + // resolves to the same underlying cube member into a single + // CubeScan, but ONLY under an aggregate whose GROUP BY is exactly + // that shared join key. The merged scan becomes a multi-fact query + // (FULL OUTER stitched over the group-by key by the planner). 
+ // Gating on the aggregate means ungrouped queries (e.g. SELECT *) + // and queries grouping by a non-join-key dimension are not merged. + transforming_rewrite( + "push-down-aggregate-shared-member-join", + aggregate( + join( + cube_scan( + "?left_alias_to_cube", + "?left_members", + "?left_filters", + "?left_orders", + "CubeScanLimit:None", + "CubeScanOffset:None", + "?left_split", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?left_join_hints", + ), + cube_scan( + "?right_alias_to_cube", + "?right_members", + "?right_filters", + "?right_orders", + "CubeScanLimit:None", + "CubeScanOffset:None", + "?right_split", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?right_join_hints", + ), + "?left_on", + "?right_on", + "?join_type", + "?join_constraint", + "?null_equals_null", + ), + "?group_expr", + "?aggr_expr", + "?agg_split", + ), + aggregate( + cube_scan( + "?out_alias_to_cube", + cube_scan_members("?left_members", "?right_members"), + cube_scan_filters("?left_filters", "?right_filters"), + cube_scan_order_empty_tail(), + "CubeScanLimit:None", + "CubeScanOffset:None", + "CubeScanSplit:false", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?out_join_hints", + ), + "?group_expr", + "?aggr_expr", + "?agg_split", + ), + self.push_down_aggregate_shared_member_join( + "?left_alias_to_cube", + "?right_alias_to_cube", + "?out_alias_to_cube", + "?left_members", + "?right_members", + "?left_on", + "?right_on", "?join_type", + "?left_join_hints", + "?right_join_hints", + "?out_join_hints", "?left_filters", + "?group_expr", ), ), ]; @@ -2794,8 +2873,96 @@ impl MemberRules { left_join_hints_var: &'static str, right_join_hints_var: &'static str, out_join_hints_var: &'static str, + ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { + let left_alias_to_cube_var = var!(left_alias_to_cube_var); + let right_alias_to_cube_var = 
var!(right_alias_to_cube_var); + let out_alias_to_cube_var = var!(out_alias_to_cube_var); + let left_members_var = var!(left_members_var); + let right_members_var = var!(right_members_var); + let left_on_var = var!(left_on_var); + let right_on_var = var!(right_on_var); + let left_join_hints_var = var!(left_join_hints_var); + let right_join_hints_var = var!(right_join_hints_var); + let out_join_hints_var = var!(out_join_hints_var); + move |egraph, subst| { + let Some((left_cube, right_cube)) = is_proper_cube_join_condition( + egraph, + subst, + left_members_var, + left_on_var, + right_members_var, + right_on_var, + ) else { + return false; + }; + + for left_alias_to_cube in + var_iter!(egraph[subst[left_alias_to_cube_var]], CubeScanAliasToCube) + { + for right_alias_to_cube in + var_iter!(egraph[subst[right_alias_to_cube_var]], CubeScanAliasToCube) + { + for left_join_hints in + var_iter!(egraph[subst[left_join_hints_var]], CubeScanJoinHints) + { + for right_join_hints in + var_iter!(egraph[subst[right_join_hints_var]], CubeScanJoinHints) + { + let out_alias_to_cube = CubeScanAliasToCube( + left_alias_to_cube + .iter() + .chain(right_alias_to_cube.iter()) + .cloned() + .collect(), + ); + + let out_join_hints = CubeScanJoinHints( + left_join_hints + .iter() + .chain(right_join_hints.iter()) + .cloned() + .chain(iter::once(vec![left_cube, right_cube])) + .collect(), + ); + + subst.insert( + out_alias_to_cube_var, + egraph.add(LogicalPlanLanguage::CubeScanAliasToCube( + out_alias_to_cube, + )), + ); + + subst.insert( + out_join_hints_var, + egraph.add(LogicalPlanLanguage::CubeScanJoinHints(out_join_hints)), + ); + + return true; + } + } + } + } + + false + } + } + + #[allow(clippy::too_many_arguments)] + fn push_down_aggregate_shared_member_join( + &self, + left_alias_to_cube_var: &'static str, + right_alias_to_cube_var: &'static str, + out_alias_to_cube_var: &'static str, + left_members_var: &'static str, + right_members_var: &'static str, + left_on_var: &'static str, 
+ right_on_var: &'static str, join_type_var: &'static str, + left_join_hints_var: &'static str, + right_join_hints_var: &'static str, + out_join_hints_var: &'static str, left_filters_var: &'static str, + group_expr_var: &'static str, ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { let left_alias_to_cube_var = var!(left_alias_to_cube_var); let right_alias_to_cube_var = var!(right_alias_to_cube_var); @@ -2804,16 +2971,14 @@ impl MemberRules { let right_members_var = var!(right_members_var); let left_on_var = var!(left_on_var); let right_on_var = var!(right_on_var); + let join_type_var = var!(join_type_var); let left_join_hints_var = var!(left_join_hints_var); let right_join_hints_var = var!(right_join_hints_var); let out_join_hints_var = var!(out_join_hints_var); - let join_type_var = var!(join_type_var); let left_filters_var = var!(left_filters_var); + let group_expr_var = var!(group_expr_var); let meta_context = self.meta_context.clone(); move |egraph, subst| { - // Resolves a join column to the name of the dimension member it - // references, but only for plain or time dimensions (measures, - // segments, etc. are not valid shared join keys). fn dimension_member_name( egraph: &mut CubeEGraph, members_id: Id, @@ -2826,164 +2991,170 @@ impl MemberRules { } } - // Two ways to recognize a joinable pair of CubeScans: - // 1. The classic `left.__cubeJoinField = right.__cubeJoinField` - // condition that comes from the data model join graph. - // 2. A join between two CubeScans (typically views) on a - // dimension that resolves to the *same underlying cube member* - // — e.g. `orders_view.city = customers_view.city` where both - // `city` dimensions are aliases of the same `cube.dimension`. - // Such a join is on the same shared key, so the two scans can - // be merged into a single CubeScan exactly like any other - // cube-to-cube join, letting the query planner treat the result - // as a (multi-fact) query over the combined members. 
- let cubes = is_proper_cube_join_condition( - egraph, - subst, - left_members_var, - left_on_var, - right_members_var, - right_on_var, - ); - let mut shared_left_keys: Vec<String> = vec![]; - let mut shared_right_keys: Vec<String> = vec![]; - let (cubes, shared_member_join) = match cubes { - Some(cubes) => (Some(cubes), false), - None => { - // A view dimension keeps the original `cube.dimension` path - // in `alias_member`; for non-view members we fall back to - // the member name itself. - let resolve_underlying = |member_name: &str| -> String { - meta_context - .find_dimension_with_name(member_name) - .and_then(|dim| dim.alias_member.clone()) - .unwrap_or_else(|| member_name.to_string()) - }; + let resolve_underlying = |member_name: &str| -> String { + meta_context + .find_dimension_with_name(member_name) + .and_then(|dim| dim.alias_member.clone()) + .unwrap_or_else(|| member_name.to_string()) + }; - let left_join_ons = var_iter!(egraph[subst[left_on_var]], JoinLeftOn) - .cloned() - .collect::<Vec<_>>(); - let right_join_ons = var_iter!(egraph[subst[right_on_var]], JoinRightOn) - .cloned() - .collect::<Vec<_>>(); + // The join must be on dimensions that resolve to the same + // underlying cube member on both sides (a shared key). + let left_join_ons = var_iter!(egraph[subst[left_on_var]], JoinLeftOn) + .cloned() + .collect::<Vec<_>>(); + let right_join_ons = var_iter!(egraph[subst[right_on_var]], JoinRightOn) + .cloned() + .collect::<Vec<_>>(); - let mut found = None; - 'pairs: for left_on in left_join_ons.iter() { - for right_on in right_join_ons.iter() { - // We can only merge when the *whole* join key is - // fully within dimensions: every column pair must - // resolve to a dimension (or time dimension) on both - // sides and to the same underlying cube member. A - // join key that touches a measure/segment/etc. (or - // mixes underlying members) is rejected, leaving the - // join to the normal (non-merged) handling.
- if left_on.is_empty() || left_on.len() != right_on.len() { - continue; - } - let mut left_cube_name: Option<String> = None; - let mut right_cube_name: Option<String> = None; - let mut left_keys: Vec<String> = vec![]; - let mut right_keys: Vec<String> = vec![]; - let mut all_match = true; - for (left_column, right_column) in left_on.iter().zip(right_on.iter()) { - let Some(left_name) = dimension_member_name( - egraph, - subst[left_members_var], - left_column, - ) else { - all_match = false; - break; - }; - let Some(right_name) = dimension_member_name( - egraph, - subst[right_members_var], - right_column, - ) else { - all_match = false; - break; - }; - if resolve_underlying(&left_name) != resolve_underlying(&right_name) - { - all_match = false; - break; - } - left_cube_name = left_name.split('.').next().map(|s| s.to_string()); - right_cube_name = - right_name.split('.').next().map(|s| s.to_string()); - left_keys.push(left_name); - right_keys.push(right_name); - } - if all_match { - if let (Some(left_cube_name), Some(right_cube_name)) = - (left_cube_name, right_cube_name) - { - found = Some((left_cube_name, right_cube_name)); - shared_left_keys = left_keys; - shared_right_keys = right_keys; - break 'pairs; - } - } + let mut matched: Option<( + String, + String, + Vec<String>, + Vec<String>, + Vec<Column>, + Vec<Column>, + )> = None; + 'pairs: for left_on in left_join_ons.iter() { + for right_on in right_join_ons.iter() { + if left_on.is_empty() || left_on.len() != right_on.len() { + continue; + } + let mut left_cube_name: Option<String> = None; + let mut right_cube_name: Option<String> = None; + let mut left_keys: Vec<String> = vec![]; + let mut right_keys: Vec<String> = vec![]; + let mut all_match = true; + for (left_column, right_column) in left_on.iter().zip(right_on.iter()) { + let Some(left_name) = + dimension_member_name(egraph, subst[left_members_var], left_column) + else { + all_match = false; + break; + }; + let Some(right_name) = + dimension_member_name(egraph, subst[right_members_var], right_column) + else { + all_match = false; + break; + }; + if
resolve_underlying(&left_name) != resolve_underlying(&right_name) { + all_match = false; + break; + } + left_cube_name = left_name.split('.').next().map(|s| s.to_string()); + right_cube_name = right_name.split('.').next().map(|s| s.to_string()); + left_keys.push(left_name); + right_keys.push(right_name); + } + if all_match { + if let (Some(left_cube_name), Some(right_cube_name)) = + (left_cube_name, right_cube_name) + { + matched = Some(( + left_cube_name, + right_cube_name, + left_keys, + right_keys, + left_on.clone(), + right_on.clone(), + )); + break 'pairs; } } - (found, true) } - }; - let Some((left_cube, right_cube)) = cubes else { + } + + let Some(( + left_cube, + right_cube, + shared_left_keys, + shared_right_keys, + matched_left_cols, + matched_right_cols, + )) = matched + else { return false; }; - // For a join between two views on a shared cube member, Tesseract - // renders the merged multi-fact scan as a FULL OUTER JOIN over the - // shared key. Re-introduce the requested INNER/LEFT/RIGHT semantics - // by requiring the join key of each "must be present" side to be - // set (FULL adds nothing). - if shared_member_join { - let mut require_left = false; - let mut require_right = false; - if let Some(join_type) = var_list_iter!(egraph[subst[join_type_var]], JoinJoinType) - .cloned() - .next() + // The join key must be fully within the GROUP BY dimensions: every + // group-by column must be one of the join-key columns, and every + // join-key pair must be grouped. This is what makes the multi-fact + // stitch over the group-by key match the requested join. 
+ let Some(group_referenced_expr) = + &egraph.index(subst[group_expr_var]).data.referenced_expr + else { + return false; + }; + let group_cols = referenced_columns(group_referenced_expr); + if group_cols.is_empty() { + return false; + } + let join_key_cols: HashSet<String> = matched_left_cols + .iter() + .chain(matched_right_cols.iter()) + .map(|c| c.flat_name()) + .collect(); + if !group_cols.iter().all(|c| join_key_cols.contains(c)) { + return false; + } + let group_set: HashSet<&String> = group_cols.iter().collect(); + for (left_col, right_col) in matched_left_cols.iter().zip(matched_right_cols.iter()) { + if !group_set.contains(&left_col.flat_name()) + && !group_set.contains(&right_col.flat_name()) { - match join_type.0 { - datafusion::prelude::JoinType::Inner => { - require_left = true; - require_right = true; - } - datafusion::prelude::JoinType::Left => require_left = true, - datafusion::prelude::JoinType::Right => require_right = true, - _ => {} - } + return false; } + } - let mut presence_members: Vec<String> = vec![]; - if require_left { - presence_members.extend(shared_left_keys.iter().cloned()); - } - if require_right { - presence_members.extend(shared_right_keys.iter().cloned()); + // Re-introduce INNER/LEFT/RIGHT semantics on top of the FULL OUTER + // multi-fact stitch by requiring the join key of each "must be + // present" side to be set (FULL adds nothing).
+ let mut require_left = false; + let mut require_right = false; + if let Some(join_type) = var_list_iter!(egraph[subst[join_type_var]], JoinJoinType) + .cloned() + .next() + { + match join_type.0 { + datafusion::prelude::JoinType::Inner => { + require_left = true; + require_right = true; + } + datafusion::prelude::JoinType::Left => require_left = true, + datafusion::prelude::JoinType::Right => require_right = true, + _ => {} } + } - if !presence_members.is_empty() { - let mut acc = subst[left_filters_var]; - for name in presence_members { - let member = egraph.add(LogicalPlanLanguage::FilterMemberMember( - crate::compile::rewrite::FilterMemberMember(name), - )); - let op = egraph.add(LogicalPlanLanguage::FilterMemberOp( - crate::compile::rewrite::FilterMemberOp("set".to_string()), - )); - let values = egraph.add(LogicalPlanLanguage::FilterMemberValues( - crate::compile::rewrite::FilterMemberValues(vec![]), - )); - let filter_member = - egraph.add(LogicalPlanLanguage::FilterMember([member, op, values])); - acc = egraph.add(LogicalPlanLanguage::CubeScanFilters(vec![ - filter_member, - acc, - ])); - } - subst.insert(left_filters_var, acc); + let mut presence_members: Vec<String> = vec![]; + if require_left { + presence_members.extend(shared_left_keys.iter().cloned()); + } + if require_right { + presence_members.extend(shared_right_keys.iter().cloned()); + } + + if !presence_members.is_empty() { + let mut acc = subst[left_filters_var]; + for name in presence_members { + let member = egraph.add(LogicalPlanLanguage::FilterMemberMember( + crate::compile::rewrite::FilterMemberMember(name), + )); + let op = egraph.add(LogicalPlanLanguage::FilterMemberOp( + crate::compile::rewrite::FilterMemberOp("set".to_string()), + )); + let values = egraph.add(LogicalPlanLanguage::FilterMemberValues( + crate::compile::rewrite::FilterMemberValues(vec![]), + )); + let filter_member = + egraph.add(LogicalPlanLanguage::FilterMember([member, op, values])); + acc =
egraph.add(LogicalPlanLanguage::CubeScanFilters(vec![ + filter_member, + acc, + ])); } + subst.insert(left_filters_var, acc); } for left_alias_to_cube in @@ -3011,7 +3182,7 @@ impl MemberRules { .iter() .chain(right_join_hints.iter()) .cloned() - .chain(iter::once(vec![left_cube, right_cube])) + .chain(iter::once(vec![left_cube.clone(), right_cube.clone()])) .collect(), ); diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index a1e1c285e0d80..64373622fd2b8 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -44,10 +44,12 @@ fn views_meta() -> Vec<CubeMeta> { description: None, title: None, r#type: V1CubeMetaType::View, - dimensions: vec![dimension( - "customers_view.customer_city", - "customers.customer_city", - )], + dimensions: vec![ + dimension("customers_view.customer_city", "customers.customer_city"), + // A second dimension that is NOT a join key, used to test that a + // query grouping by it (instead of the join key) is not merged. + dimension("customers_view.status", "customers.status"), + ], measures: vec![measure( "customers_view.avg_age", "customers.avg_age", @@ -90,61 +92,11 @@ fn set_filter(member: &str) -> V1LoadRequestQueryFilterItem { } } -/// A join between two views on a dimension that resolves to the same -/// underlying cube member (`customers.customer_city`) should be merged into a -/// single CubeScan over the combined members, exactly like a regular -/// cube-to-cube join. As a `LEFT JOIN`, the left ("must be present") side is -/// guarded with a `set` filter on its join key so the downstream FULL OUTER -/// multi-fact stitch keeps left-join semantics.
-#[tokio::test] -async fn test_join_two_views_on_shared_member() { - if !Rewriter::sql_push_down_enabled() { - return; - } - init_testing_logger(); - - let logical_plan = convert_select_to_query_plan_with_meta( - r#" - SELECT * - FROM customers_view - LEFT JOIN orders_view - ON (orders_view.customer_city = customers_view.customer_city) - "# - .to_string(), - views_meta(), - ) - .await - .as_logical_plan(); - - assert_eq!( - logical_plan.find_cube_scan().request, - V1LoadRequestQuery { - measures: Some(vec![ - "customers_view.avg_age".to_string(), - "orders_view.revenue".to_string(), - ]), - dimensions: Some(vec![ - "customers_view.customer_city".to_string(), - "orders_view.customer_city".to_string(), - ]), - segments: Some(vec![]), - order: Some(vec![]), - filters: Some(vec![set_filter("customers_view.customer_city")]), - ungrouped: Some(true), - join_hints: Some(vec![vec![ - "customers_view".to_string(), - "orders_view".to_string(), - ]]), - ..Default::default() - } - ) -} - /// The motivating query: a grouped (multi-fact) `LEFT JOIN` selecting a -/// dimension and measures from each view, joined on the shared `customer_city`. -/// The two view scans are merged into a single grouped CubeScan, and the left -/// join key gets a `set` filter to recover LEFT-join semantics on top of the -/// FULL OUTER multi-fact stitch. +/// dimension and measures from each view, joined on the shared `customer_city` +/// which is also the GROUP BY key. The two view scans are merged into a single +/// grouped CubeScan, and the left join key gets a `set` filter to recover +/// LEFT-join semantics on top of the FULL OUTER multi-fact stitch. 
#[tokio::test] async fn test_group_by_left_join_two_views_on_shared_member() { if !Rewriter::sql_push_down_enabled() { @@ -230,10 +182,65 @@ async fn test_group_by_inner_join_two_views_on_shared_member() { ) } +/// Ungrouped query (`SELECT *`): the shared-member merge only applies to +/// grouped queries, so an ungrouped join is not merged and is rejected the +/// same way any other unsupported cube join is. +#[tokio::test] +async fn test_ungrouped_join_two_views_on_shared_member_is_not_merged() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let meta = get_test_tenant_ctx_with_meta(views_meta()); + let query = convert_sql_to_cube_query( + &r#" + SELECT * + FROM customers_view + LEFT JOIN orders_view + ON (orders_view.customer_city = customers_view.customer_city) + "# + .to_string(), + meta.clone(), + get_test_session(DatabaseProtocol::PostgreSQL, meta).await, + ) + .await; + + let error = query.unwrap_err(); + assert!(matches!(error, CompilationError::Rewrite(..))); +} + +/// The join is over a dimension (`customer_city`) that is not in the GROUP BY +/// (the query groups by `status` instead). The merge requires the join key to +/// be the group-by key, so this is not merged and is rejected. +#[tokio::test] +async fn test_group_by_join_dimension_not_in_group_by_is_not_merged() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let meta = get_test_tenant_ctx_with_meta(views_meta()); + let query = convert_sql_to_cube_query( + &r#" + SELECT c.status, measure(o.revenue), measure(c.avg_age) + FROM customers_view c + LEFT JOIN orders_view o ON o.customer_city = c.customer_city + GROUP BY 1 + "# + .to_string(), + meta.clone(), + get_test_session(DatabaseProtocol::PostgreSQL, meta).await, + ) + .await; + + let error = query.unwrap_err(); + assert!(matches!(error, CompilationError::Rewrite(..))); +} + /// The merge only fires when the join key is fully within dimensions. 
Joining /// the two views on a measure (`o.revenue = c.avg_age`) is not a shared-member -/// dimension join, so the scans are not merged and the query is rejected the -/// same way any other unsupported cube join is. +/// dimension join, so the scans are not merged and the query is rejected. #[tokio::test] async fn test_join_two_views_on_measure_is_not_merged() { if !Rewriter::sql_push_down_enabled() { @@ -244,9 +251,10 @@ async fn test_join_two_views_on_measure_is_not_merged() { let meta = get_test_tenant_ctx_with_meta(views_meta()); let query = convert_sql_to_cube_query( &r#" - SELECT * + SELECT c.customer_city, measure(o.revenue) FROM customers_view c LEFT JOIN orders_view o ON (o.revenue = c.avg_age) + GROUP BY 1 "# .to_string(), meta.clone(), From 93e642b9e6ea3c0de87bf292848bce42dee1e07c Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 6 Jun 2026 19:46:34 +0000 Subject: [PATCH 10/21] feat(cubesql): gate view-join merge on the Tesseract SQL planner The shared-member view-join merge produces a multi-fact query that is only handled correctly by the Tesseract SQL planner (FULL OUTER stitch over the shared key). Add an `enable_tesseract_sql_planner` config option (read from CUBEJS_TESSERACT_SQL_PLANNER) and only fire the rewrite when it is enabled. Add a test that the merge does not happen when Tesseract is disabled. 
Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 8 ++ .../src/compile/test/test_cube_join_views.rs | 118 ++++++++++-------- rust/cubesql/cubesql/src/config/mod.rs | 9 ++ 3 files changed, 84 insertions(+), 51 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index c0aaffec860cd..f9c7a1e43f3d7 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -2978,7 +2978,15 @@ impl MemberRules { let left_filters_var = var!(left_filters_var); let group_expr_var = var!(group_expr_var); let meta_context = self.meta_context.clone(); + // Merging a view join into a single multi-fact CubeScan relies on the + // Tesseract SQL planner (it stitches the fact groups with a FULL OUTER + // JOIN over the shared key). Only enable this rewrite when Tesseract is + // enabled; the legacy planner would mis-handle the resulting query. 
+ let enable_tesseract_sql_planner = self.config_obj.enable_tesseract_sql_planner(); move |egraph, subst| { + if !enable_tesseract_sql_planner { + return false; + } fn dimension_member_name( egraph: &mut CubeEGraph, members_id: Id, diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 64373622fd2b8..e70581d89fca0 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use cubeclient::models::{V1CubeMetaType, V1LoadRequestQuery, V1LoadRequestQueryFilterItem}; use pretty_assertions::assert_eq; @@ -5,11 +7,12 @@ use crate::{ compile::{ rewrite::rewriter::Rewriter, test::{ - convert_select_to_query_plan_with_meta, convert_sql_to_cube_query, get_test_session, - get_test_tenant_ctx_with_meta, init_testing_logger, utils::LogicalPlanTestUtils, + convert_sql_to_cube_query, get_test_session_with_config, get_test_tenant_ctx_with_meta, + init_testing_logger, utils::LogicalPlanTestUtils, }, - CompilationError, DatabaseProtocol, + CompilationError, DatabaseProtocol, QueryPlan, }, + config::{ConfigObj, ConfigObjImpl}, transport::{CubeMeta, CubeMetaDimension, CubeMetaMeasure}, }; @@ -92,6 +95,26 @@ fn set_filter(member: &str) -> V1LoadRequestQueryFilterItem { } } +/// Plans `sql` against the two views, with the Tesseract SQL planner enabled or +/// disabled. The shared-member view-join merge only fires when Tesseract is +/// enabled. 
+async fn plan_view_join(sql: &str, tesseract: bool) -> Result<QueryPlan, CompilationError> { + let meta = get_test_tenant_ctx_with_meta(views_meta()); + let mut config = ConfigObjImpl::default(); + config.tesseract_sql_planner = tesseract; + let config: Arc<dyn ConfigObj> = Arc::new(config); + let session = + get_test_session_with_config(DatabaseProtocol::PostgreSQL, config, meta.clone()).await; + convert_sql_to_cube_query(&sql.to_string(), meta, session).await +} + +const GROUPED_LEFT_JOIN: &str = r#" + SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) + FROM customers_view c + LEFT JOIN orders_view o ON o.customer_city = c.customer_city + GROUP BY 1 +"#; + /// The motivating query: a grouped (multi-fact) `LEFT JOIN` selecting a /// dimension and measures from each view, joined on the shared `customer_city` /// which is also the GROUP BY key. The two view scans are merged into a single @@ -104,18 +127,10 @@ async fn test_group_by_left_join_two_views_on_shared_member() { } init_testing_logger(); - let logical_plan = convert_select_to_query_plan_with_meta( - r#" - SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) - FROM customers_view c - LEFT JOIN orders_view o ON o.customer_city = c.customer_city - GROUP BY 1 - "# - .to_string(), - views_meta(), - ) - .await - .as_logical_plan(); + let logical_plan = plan_view_join(GROUPED_LEFT_JOIN, true) + .await + .unwrap() + .as_logical_plan(); assert_eq!( logical_plan.find_cube_scan().request, @@ -146,17 +161,17 @@ async fn test_group_by_inner_join_two_views_on_shared_member() { } init_testing_logger(); - let logical_plan = convert_select_to_query_plan_with_meta( + let logical_plan = plan_view_join( r#" SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) FROM customers_view c INNER JOIN orders_view o ON o.customer_city = c.customer_city GROUP BY 1 - "# - .to_string(), - views_meta(), + "#, + true, ) .await + .unwrap() .as_logical_plan(); assert_eq!( @@ -182,9 +197,22 @@ async fn test_group_by_inner_join_two_views_on_shared_member()
{ ) } +/// The merge relies on the Tesseract SQL planner; with it disabled the join is +/// not merged and the query is rejected like any other unsupported cube join. +#[tokio::test] +async fn test_grouped_view_join_not_merged_without_tesseract() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let error = plan_view_join(GROUPED_LEFT_JOIN, false).await.unwrap_err(); + assert!(matches!(error, CompilationError::Rewrite(..))); +} + /// Ungrouped query (`SELECT *`): the shared-member merge only applies to -/// grouped queries, so an ungrouped join is not merged and is rejected the -/// same way any other unsupported cube join is. +/// grouped queries, so an ungrouped join is not merged and is rejected even +/// when Tesseract is enabled. #[tokio::test] async fn test_ungrouped_join_two_views_on_shared_member_is_not_merged() { if !Rewriter::sql_push_down_enabled() { @@ -192,21 +220,17 @@ async fn test_ungrouped_join_two_views_on_shared_member_is_not_merged() { } init_testing_logger(); - let meta = get_test_tenant_ctx_with_meta(views_meta()); - let query = convert_sql_to_cube_query( - &r#" + let error = plan_view_join( + r#" SELECT * FROM customers_view LEFT JOIN orders_view ON (orders_view.customer_city = customers_view.customer_city) - "# - .to_string(), - meta.clone(), - get_test_session(DatabaseProtocol::PostgreSQL, meta).await, + "#, + true, ) - .await; - - let error = query.unwrap_err(); + .await + .unwrap_err(); assert!(matches!(error, CompilationError::Rewrite(..))); } @@ -220,21 +244,17 @@ async fn test_group_by_join_dimension_not_in_group_by_is_not_merged() { } init_testing_logger(); - let meta = get_test_tenant_ctx_with_meta(views_meta()); - let query = convert_sql_to_cube_query( - &r#" + let error = plan_view_join( + r#" SELECT c.status, measure(o.revenue), measure(c.avg_age) FROM customers_view c LEFT JOIN orders_view o ON o.customer_city = c.customer_city GROUP BY 1 - "# - .to_string(), - meta.clone(), - 
get_test_session(DatabaseProtocol::PostgreSQL, meta).await, + "#, + true, ) - .await; - - let error = query.unwrap_err(); + .await + .unwrap_err(); assert!(matches!(error, CompilationError::Rewrite(..))); } @@ -248,20 +268,16 @@ async fn test_join_two_views_on_measure_is_not_merged() { } init_testing_logger(); - let meta = get_test_tenant_ctx_with_meta(views_meta()); - let query = convert_sql_to_cube_query( - &r#" + let error = plan_view_join( + r#" SELECT c.customer_city, measure(o.revenue) FROM customers_view c LEFT JOIN orders_view o ON (o.revenue = c.avg_age) GROUP BY 1 - "# - .to_string(), - meta.clone(), - get_test_session(DatabaseProtocol::PostgreSQL, meta).await, + "#, + true, ) - .await; - - let error = query.unwrap_err(); + .await + .unwrap_err(); assert!(matches!(error, CompilationError::Rewrite(..))); } diff --git a/rust/cubesql/cubesql/src/config/mod.rs b/rust/cubesql/cubesql/src/config/mod.rs index d7977a5d4feb7..6dc13293ab64d 100644 --- a/rust/cubesql/cubesql/src/config/mod.rs +++ b/rust/cubesql/cubesql/src/config/mod.rs @@ -117,6 +117,8 @@ pub trait ConfigObj: DIService + Debug { fn max_sessions(&self) -> usize; fn no_implicit_order(&self) -> bool; + + fn enable_tesseract_sql_planner(&self) -> bool; } #[derive(Debug, Clone)] @@ -138,6 +140,7 @@ pub struct ConfigObjImpl { pub non_streaming_query_max_row_limit: i32, pub max_sessions: usize, pub no_implicit_order: bool, + pub tesseract_sql_planner: bool, } impl ConfigObjImpl { @@ -181,6 +184,7 @@ impl ConfigObjImpl { non_streaming_query_max_row_limit: env_parse("CUBEJS_DB_QUERY_LIMIT", 50000), max_sessions: env_parse("CUBEJS_MAX_SESSIONS", 1024), no_implicit_order: env_parse("CUBESQL_SQL_NO_IMPLICIT_ORDER", true), + tesseract_sql_planner: env_parse("CUBEJS_TESSERACT_SQL_PLANNER", false), } } } @@ -251,6 +255,10 @@ impl ConfigObj for ConfigObjImpl { fn max_sessions(&self) -> usize { self.max_sessions } + + fn enable_tesseract_sql_planner(&self) -> bool { + self.tesseract_sql_planner + } } impl Config { 
@@ -284,6 +292,7 @@ impl Config { non_streaming_query_max_row_limit: 50000, max_sessions: 1024, no_implicit_order: true, + tesseract_sql_planner: false, }), } } From 3b618866a6fe4ae49e386eed39cd54f3a36b4a1f Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 6 Jun 2026 21:01:28 +0000 Subject: [PATCH 11/21] docs: document multi-fact queries via SQL API view joins Document that joining two views on a dimension that resolves to the same underlying cube member (grouped by that key) triggers a multi-fact query in the SQL API, including the join-type semantics (inner/left/right/full) and the Tesseract requirement. Add the behavior to the multi-fact views page and a cross-referencing section in the SQL API joins reference. Co-authored-by: Pavel Tiunov --- .../docs/data-modeling/multi-fact-views.mdx | 51 +++++++++++++++++++ .../core-data-apis/sql-api/joins.mdx | 28 +++++++++- 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx index bd9634615ab87..0e094c8c56af0 100644 --- a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx +++ b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx @@ -310,6 +310,56 @@ The combined result shows measures from each fact table side by side: Charlie has no orders and Diana has no returns — both are still included with `NULL` values for the missing fact table. +## Joining views in the SQL API + +You don't have to define a dedicated multi-fact view to get multi-fact +behavior. The [SQL API][ref-sql-api] produces the same query when you **join +two views on a dimension they share** and group by that dimension. + +Suppose `orders_view` and `returns_view` are two separate views that each +expose the customer's `name` (both backed by the same underlying +`customers.name` member). 
Joining them on `name` and grouping by it triggers a +multi-fact query: + +```sql +SELECT + o.name, + MEASURE(o.total_amount), + MEASURE(r.total_refund) +FROM orders_view o +LEFT JOIN returns_view r ON r.name = o.name +GROUP BY 1 +``` + +Cube recognizes that both `name` columns resolve to the same cube member, +merges the two view scans into a single multi-fact query, and runs it with the +separate-subquery-then-join strategy described +[above](#what-cube-does-under-the-hood). + +This rewrite applies only when: + +- The Tesseract SQL planner is enabled via + [`CUBEJS_TESSERACT_SQL_PLANNER`][ref-tesseract-env]. +- Both sides of the join condition resolve to the **same underlying cube + member** (a shared dimension), and the join key is composed only of + dimensions. +- The query is **grouped by the join key** — every grouped dimension is the + shared join key. Ungrouped joins (such as `SELECT *`) and queries that group + by a different dimension are not merged and fall back to standard join + handling. + +### Join type + +The facts are stitched together with a `FULL JOIN` on the shared key, and the +`JOIN` type in your SQL controls which rows are kept: + +| SQL join | Result | +| --- | --- | +| `FULL [OUTER] JOIN` | every key from either view (default multi-fact behavior) | +| `INNER JOIN` | only keys present in **both** views | +| `LEFT JOIN` | every key from the left view; right-side measures are `NULL` when missing | +| `RIGHT JOIN` | every key from the right view; left-side measures are `NULL` when missing | + ## Common patterns ### Time as the shared dimension @@ -417,5 +467,6 @@ to that fact's subquery. 
[ref-views]: /docs/data-modeling/views [ref-view-ref]: /reference/data-modeling/view [ref-segments]: /reference/data-modeling/segments +[ref-sql-api]: /reference/core-data-apis/sql-api [ref-tesseract-env]: /reference/configuration/environment-variables#cubejs_tesseract_sql_planner [link-tesseract]: https://cube.dev/blog/introducing-tesseract diff --git a/docs-mintlify/reference/core-data-apis/sql-api/joins.mdx b/docs-mintlify/reference/core-data-apis/sql-api/joins.mdx index 042c0e575dc10..48ae8a7b4935e 100644 --- a/docs-mintlify/reference/core-data-apis/sql-api/joins.mdx +++ b/docs-mintlify/reference/core-data-apis/sql-api/joins.mdx @@ -207,7 +207,33 @@ LIMIT 5; Please note that, even if `product_description` is in the inner selection, it isn't evaluated in the final query as it isn't used in any way. +## Joining views on a shared dimension + +When you join two views on a dimension that resolves to the **same underlying +cube member** and group by that dimension, Cube doesn't perform a row-level +join. Instead it merges them into a single +[multi-fact query][ref-multi-fact-views]: each view becomes its own +aggregating subquery and the results are stitched together on the shared key, +so measures from both views are combined without fan-out. + +```sql +SELECT + o.name, + MEASURE(o.total_amount), + MEASURE(r.total_refund) +FROM orders_view o +LEFT JOIN returns_view r ON r.name = o.name +GROUP BY 1 +``` + +The `JOIN` type (`INNER`, `LEFT`, `RIGHT`, `FULL`) controls which keys are +kept. This requires the [Tesseract SQL planner][ref-tesseract-env] and only +applies to grouped queries whose `GROUP BY` is the join key. See +[multi-fact views][ref-multi-fact-views] for the full explanation. 
+ [ref-views]: /docs/data-modeling/views [ref-join-paths]: /docs/data-modeling/joins#join-paths -[ref-join-hints]: /docs/data-modeling/joins#join-hints \ No newline at end of file +[ref-join-hints]: /docs/data-modeling/joins#join-hints +[ref-multi-fact-views]: /docs/data-modeling/multi-fact-views +[ref-tesseract-env]: /reference/configuration/environment-variables#cubejs_tesseract_sql_planner \ No newline at end of file From 4872fcb2824a8d2f8635b480cd0a5d6ab8ca2839 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 6 Jun 2026 21:15:21 +0000 Subject: [PATCH 12/21] fix(cubesql): require all join-key columns on a side to share one cube/view A CubeScan can expose members from multiple cubes/views, so enforce that every join-key column on each side resolves to the same cube/view. A mismatch would make the merged join hint ambiguous, so such joins are no longer merged. Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index f9c7a1e43f3d7..d543e519ecbcb 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -3050,8 +3050,23 @@ impl MemberRules { all_match = false; break; } - left_cube_name = left_name.split('.').next().map(|s| s.to_string()); - right_cube_name = right_name.split('.').next().map(|s| s.to_string()); + // A CubeScan can expose members from multiple cubes/views, + // so every join-key column on a given side must resolve to + // the same cube/view. Otherwise the merged join hint would + // be ambiguous and the merge is not a single shared-member + // join we can represent. 
+ let this_left_cube = left_name.split('.').next().map(|s| s.to_string()); + let this_right_cube = right_name.split('.').next().map(|s| s.to_string()); + if left_cube_name.is_some() && left_cube_name != this_left_cube { + all_match = false; + break; + } + if right_cube_name.is_some() && right_cube_name != this_right_cube { + all_match = false; + break; + } + left_cube_name = this_left_cube; + right_cube_name = this_right_cube; left_keys.push(left_name); right_keys.push(right_name); } From 45db195e9d84a609e000242545b9594cf47ff52d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 04:22:22 +0000 Subject: [PATCH 13/21] refactor(cubesql): build view-join presence filters only on successful merge; test RIGHT/FULL Address review nits: - Construct the join-semantics set filters and mutate subst inside the innermost iteration, right before returning true, so a false return never leaves a stale subst entry or orphan filter e-nodes. - Add RIGHT JOIN and FULL JOIN tests to lock in the join-type table (right join key set filter; no filter for full). (The composite-key single-cube-per-side check was already added in a prior commit.) 
Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 53 +++++++----- .../src/compile/test/test_cube_join_views.rs | 85 +++++++++++++++++++ 2 files changed, 116 insertions(+), 22 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index d543e519ecbcb..e6805710582a3 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -3158,28 +3158,6 @@ impl MemberRules { presence_members.extend(shared_right_keys.iter().cloned()); } - if !presence_members.is_empty() { - let mut acc = subst[left_filters_var]; - for name in presence_members { - let member = egraph.add(LogicalPlanLanguage::FilterMemberMember( - crate::compile::rewrite::FilterMemberMember(name), - )); - let op = egraph.add(LogicalPlanLanguage::FilterMemberOp( - crate::compile::rewrite::FilterMemberOp("set".to_string()), - )); - let values = egraph.add(LogicalPlanLanguage::FilterMemberValues( - crate::compile::rewrite::FilterMemberValues(vec![]), - )); - let filter_member = - egraph.add(LogicalPlanLanguage::FilterMember([member, op, values])); - acc = egraph.add(LogicalPlanLanguage::CubeScanFilters(vec![ - filter_member, - acc, - ])); - } - subst.insert(left_filters_var, acc); - } - for left_alias_to_cube in var_iter!(egraph[subst[left_alias_to_cube_var]], CubeScanAliasToCube) { @@ -3221,6 +3199,37 @@ impl MemberRules { egraph.add(LogicalPlanLanguage::CubeScanJoinHints(out_join_hints)), ); + // Add the join-semantics presence filters only once a + // concrete merge is being produced, so a `false` return + // never leaves a stale `subst` entry behind. 
+ if !presence_members.is_empty() { + let mut acc = subst[left_filters_var]; + for name in &presence_members { + let member = + egraph.add(LogicalPlanLanguage::FilterMemberMember( + crate::compile::rewrite::FilterMemberMember( + name.clone(), + ), + )); + let op = egraph.add(LogicalPlanLanguage::FilterMemberOp( + crate::compile::rewrite::FilterMemberOp("set".to_string()), + )); + let values = + egraph.add(LogicalPlanLanguage::FilterMemberValues( + crate::compile::rewrite::FilterMemberValues(vec![]), + )); + let filter_member = + egraph.add(LogicalPlanLanguage::FilterMember([ + member, op, values, + ])); + acc = egraph.add(LogicalPlanLanguage::CubeScanFilters(vec![ + filter_member, + acc, + ])); + } + subst.insert(left_filters_var, acc); + } + return true; } } diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index e70581d89fca0..2c470f280007c 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -281,3 +281,88 @@ async fn test_join_two_views_on_measure_is_not_merged() { .unwrap_err(); assert!(matches!(error, CompilationError::Rewrite(..))); } + +/// `RIGHT JOIN`: the right side must be present, so the merged scan carries a +/// `set` filter on the right join key. 
+#[tokio::test] +async fn test_group_by_right_join_two_views_on_shared_member() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) + FROM customers_view c + RIGHT JOIN orders_view o ON o.customer_city = c.customer_city + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "orders_view.revenue".to_string(), + "customers_view.avg_age".to_string(), + ]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + filters: Some(vec![set_filter("orders_view.customer_city")]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} + +/// `FULL JOIN`: every key from either side is kept (default multi-fact +/// behavior), so no presence `set` filter is added. +#[tokio::test] +async fn test_group_by_full_join_two_views_on_shared_member() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue), measure(c.avg_age) + FROM customers_view c + FULL JOIN orders_view o ON o.customer_city = c.customer_city + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "orders_view.revenue".to_string(), + "customers_view.avg_age".to_string(), + ]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + // FULL JOIN adds no presence filter. 
+ filters: None, + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} From cee814ef11de8d8d1e2eea40544a738b4f8edadb Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 18:42:53 +0000 Subject: [PATCH 14/21] feat(cubesql): MultiFactJoinWrapper for N-way view joins and filter push-down Introduce a MultiFactJoinWrapper intermediate egraph node so the shared-member view-join rewrite is no longer a single aggregate-bound rule. The rewrite now splits into: - shared-member-join-to-wrapper: Join(CubeScan, CubeScan) -> wrapper(CubeScan) - shared-member-join-extend-wrapper: Join(wrapper(CubeScan), CubeScan) -> wrapper(CubeScan), enabling joins of 3+ views - multi-fact-join-wrapper-filter-push-down: Filter(wrapper) -> wrapper(Filter), pushing WHERE/ON filters into the merged scan - aggregate-multi-fact-join-wrapper: unwrap only when GROUP BY matches the recorded join key The wrapper records the join key (as underlying cube members) so the finalize rule can verify the GROUP BY, while joins and filters compose beforehand. Adds tests for 3-way and 4-way FULL joins, a WHERE filter, and an ON-clause filter, in addition to the existing 2-way LEFT/INNER/RIGHT/FULL coverage. 
Co-authored-by: Pavel Tiunov --- .../cubesql/src/compile/rewrite/cost.rs | 1 + .../cubesql/src/compile/rewrite/mod.rs | 14 + .../src/compile/rewrite/rules/members.rs | 346 +++++++++++++----- .../src/compile/test/test_cube_join_views.rs | 223 ++++++++++- 4 files changed, 481 insertions(+), 103 deletions(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/cost.rs b/rust/cubesql/cubesql/src/compile/rewrite/cost.rs index 16ffed9991a53..1245ad047d31c 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/cost.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/cost.rs @@ -127,6 +127,7 @@ impl BestCubePlan { LogicalPlanLanguage::JoinCheckStage(_) => 1, LogicalPlanLanguage::JoinCheckPushDown(_) => 1, LogicalPlanLanguage::JoinCheckPullUp(_) => 1, + LogicalPlanLanguage::MultiFactJoinWrapper(_) => 1, LogicalPlanLanguage::SortProjectionPushdownReplacer(_) => 1, LogicalPlanLanguage::SortProjectionPullupReplacer(_) => 1, // Not really replacers but those should be deemed as mandatory rewrites and as soon as diff --git a/rust/cubesql/cubesql/src/compile/rewrite/mod.rs b/rust/cubesql/cubesql/src/compile/rewrite/mod.rs index d888ad5a9ac05..30f77e3106ce1 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/mod.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/mod.rs @@ -541,6 +541,16 @@ crate::plan_to_language! { left_input: Arc, right_input: Arc, }, + // Intermediate node produced while merging a join of two (view) + // CubeScans on a shared cube member into a single multi-fact CubeScan. + // `input` is the merged CubeScan; `join_members` holds the underlying + // cube members the scans were joined on, so the aggregate finalize rule + // can verify the GROUP BY matches the join key. Rewrite-only: it must be + // eliminated (unwrapped at the aggregate) before extraction. 
+ MultiFactJoinWrapper { + input: Arc, + join_members: Vec, + }, } } @@ -2266,6 +2276,10 @@ fn cube_scan_wrapper(input: impl Display, finalized: impl Display) -> String { format!("(CubeScanWrapper {} {})", input, finalized) } +fn multi_fact_join_wrapper(input: impl Display, join_members: impl Display) -> String { + format!("(MultiFactJoinWrapper {} {})", input, join_members) +} + fn distinct(input: impl Display) -> String { format!("(Distinct {})", input) } diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index e6805710582a3..864d4335a8128 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -7,10 +7,10 @@ use crate::{ binary_expr, cast_expr, change_user_expr, column_expr, cross_join, cube_scan, cube_scan_filters, cube_scan_filters_empty_tail, cube_scan_members, cube_scan_members_empty_tail, cube_scan_order_empty_tail, dimension_expr, distinct, - expr_column_name, fun_expr, join, like_expr, limit, list_concat_pushdown_replacer, - list_concat_pushup_replacer, literal_expr, literal_member, measure_expr, - member_pushdown_replacer, member_replacer, original_expr_name, projection, - referenced_columns, rewrite, + expr_column_name, filter, fun_expr, join, like_expr, limit, + list_concat_pushdown_replacer, list_concat_pushup_replacer, literal_expr, + literal_member, measure_expr, member_pushdown_replacer, member_replacer, + multi_fact_join_wrapper, original_expr_name, projection, referenced_columns, rewrite, rewriter::{CubeEGraph, CubeRewrite, RewriteRules}, rules::{ replacer_flat_push_down_node_substitute_rules, replacer_push_down_node, @@ -26,9 +26,10 @@ use crate::{ LikeExprLikeType, LikeExprNegated, LikeType, LimitFetch, LimitSkip, ListType, LiteralExprValue, LiteralMemberRelation, LiteralMemberValue, LogicalPlanLanguage, MeasureName, MemberErrorAliasToCube, MemberErrorError, MemberErrorPriority, - 
MemberPushdownReplacerAliasToCube, MemberReplacerAliasToCube, ProjectionAlias, - TableScanFetch, TableScanProjection, TableScanSourceTableName, TableScanTableName, - TimeDimensionDateRange, TimeDimensionGranularity, TimeDimensionName, + MemberPushdownReplacerAliasToCube, MemberReplacerAliasToCube, + MultiFactJoinWrapperJoinMembers, ProjectionAlias, TableScanFetch, TableScanProjection, + TableScanSourceTableName, TableScanTableName, TimeDimensionDateRange, + TimeDimensionGranularity, TimeDimensionName, }, }, config::ConfigObj, @@ -434,17 +435,88 @@ impl RewriteRules for MemberRules { "?out_join_hints", ), ), - // Merge a join between two (view) CubeScans on a dimension that - // resolves to the same underlying cube member into a single - // CubeScan, but ONLY under an aggregate whose GROUP BY is exactly - // that shared join key. The merged scan becomes a multi-fact query - // (FULL OUTER stitched over the group-by key by the planner). - // Gating on the aggregate means ungrouped queries (e.g. SELECT *) - // and queries grouping by a non-join-key dimension are not merged. + // Merge a join of two (view) CubeScans on a dimension that resolves + // to the same underlying cube member into a single CubeScan wrapped + // in a MultiFactJoinWrapper. The wrapper records the join key (as + // underlying cube members) so the aggregate finalize rule can later + // require the GROUP BY to match it, and so additional joins (3+ + // views) and WHERE filters can compose before finalization. 
transforming_rewrite( - "push-down-aggregate-shared-member-join", - aggregate( - join( + "shared-member-join-to-wrapper", + join( + cube_scan( + "?left_alias_to_cube", + "?left_members", + "?left_filters", + "?left_orders", + "CubeScanLimit:None", + "CubeScanOffset:None", + "?left_split", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?left_join_hints", + ), + cube_scan( + "?right_alias_to_cube", + "?right_members", + "?right_filters", + "?right_orders", + "CubeScanLimit:None", + "CubeScanOffset:None", + "?right_split", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?right_join_hints", + ), + "?left_on", + "?right_on", + "?join_type", + "?join_constraint", + "?null_equals_null", + ), + multi_fact_join_wrapper( + cube_scan( + "?out_alias_to_cube", + cube_scan_members("?left_members", "?right_members"), + cube_scan_filters("?left_filters", "?right_filters"), + cube_scan_order_empty_tail(), + "CubeScanLimit:None", + "CubeScanOffset:None", + "CubeScanSplit:false", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?out_join_hints", + ), + "?join_members", + ), + self.merge_shared_member_join( + "?left_alias_to_cube", + "?right_alias_to_cube", + "?out_alias_to_cube", + "?left_members", + "?right_members", + "?left_on", + "?right_on", + "?join_type", + "?left_join_hints", + "?right_join_hints", + "?out_join_hints", + "?left_filters", + "?join_members", + None, + ), + ), + // Extend a MultiFactJoinWrapper with another joined (view) CubeScan, + // supporting joins of 3+ views. The new join must again be on a + // dimension resolving to the same underlying member; its key is + // unioned into the wrapper's recorded join members. 
+ transforming_rewrite( + "shared-member-join-extend-wrapper", + join( + multi_fact_join_wrapper( cube_scan( "?left_alias_to_cube", "?left_members", @@ -458,30 +530,28 @@ impl RewriteRules for MemberRules { "CubeScanUngrouped:true", "?left_join_hints", ), - cube_scan( - "?right_alias_to_cube", - "?right_members", - "?right_filters", - "?right_orders", - "CubeScanLimit:None", - "CubeScanOffset:None", - "?right_split", - "CubeScanCanPushdownJoin:true", - "CubeScanWrapped:false", - "CubeScanUngrouped:true", - "?right_join_hints", - ), - "?left_on", - "?right_on", - "?join_type", - "?join_constraint", - "?null_equals_null", + "?prev_join_members", ), - "?group_expr", - "?aggr_expr", - "?agg_split", + cube_scan( + "?right_alias_to_cube", + "?right_members", + "?right_filters", + "?right_orders", + "CubeScanLimit:None", + "CubeScanOffset:None", + "?right_split", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?right_join_hints", + ), + "?left_on", + "?right_on", + "?join_type", + "?join_constraint", + "?null_equals_null", ), - aggregate( + multi_fact_join_wrapper( cube_scan( "?out_alias_to_cube", cube_scan_members("?left_members", "?right_members"), @@ -495,11 +565,9 @@ impl RewriteRules for MemberRules { "CubeScanUngrouped:true", "?out_join_hints", ), - "?group_expr", - "?aggr_expr", - "?agg_split", + "?join_members", ), - self.push_down_aggregate_shared_member_join( + self.merge_shared_member_join( "?left_alias_to_cube", "?right_alias_to_cube", "?out_alias_to_cube", @@ -512,8 +580,34 @@ impl RewriteRules for MemberRules { "?right_join_hints", "?out_join_hints", "?left_filters", + "?join_members", + Some("?prev_join_members"), + ), + ), + // Push a Filter (e.g. a WHERE on top of the join) down through the + // wrapper into the merged CubeScan, where the standard filter + // push-down rules turn it into a Cube query filter. 
+ rewrite( + "multi-fact-join-wrapper-filter-push-down", + filter( + "?filter_expr", + multi_fact_join_wrapper("?wrapped_input", "?join_members"), + ), + multi_fact_join_wrapper(filter("?filter_expr", "?wrapped_input"), "?join_members"), + ), + // Finalize: once the query is grouped and the GROUP BY matches the + // recorded join key, drop the wrapper so the standard aggregate + // push-down turns the merged scan into a (multi-fact) CubeScan. + transforming_rewrite( + "aggregate-multi-fact-join-wrapper", + aggregate( + multi_fact_join_wrapper("?scan", "?join_members"), "?group_expr", + "?aggr_expr", + "?agg_split", ), + aggregate("?scan", "?group_expr", "?aggr_expr", "?agg_split"), + self.finalize_shared_member_join("?scan", "?join_members", "?group_expr"), ), ]; @@ -2948,7 +3042,8 @@ impl MemberRules { } #[allow(clippy::too_many_arguments)] - fn push_down_aggregate_shared_member_join( + #[allow(clippy::too_many_arguments)] + fn merge_shared_member_join( &self, left_alias_to_cube_var: &'static str, right_alias_to_cube_var: &'static str, @@ -2962,7 +3057,8 @@ impl MemberRules { right_join_hints_var: &'static str, out_join_hints_var: &'static str, left_filters_var: &'static str, - group_expr_var: &'static str, + join_members_var: &'static str, + prev_join_members_var: Option<&'static str>, ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { let left_alias_to_cube_var = var!(left_alias_to_cube_var); let right_alias_to_cube_var = var!(right_alias_to_cube_var); @@ -2976,7 +3072,8 @@ impl MemberRules { let right_join_hints_var = var!(right_join_hints_var); let out_join_hints_var = var!(out_join_hints_var); let left_filters_var = var!(left_filters_var); - let group_expr_var = var!(group_expr_var); + let join_members_var = var!(join_members_var); + let prev_join_members_var = prev_join_members_var.map(|v| var!(v)); let meta_context = self.meta_context.clone(); // Merging a view join into a single multi-fact CubeScan relies on the // Tesseract SQL planner (it stitches the 
fact groups with a FULL OUTER @@ -3015,14 +3112,7 @@ impl MemberRules { .cloned() .collect::>(); - let mut matched: Option<( - String, - String, - Vec, - Vec, - Vec, - Vec, - )> = None; + let mut matched: Option<(String, String, Vec, Vec)> = None; 'pairs: for left_on in left_join_ons.iter() { for right_on in right_join_ons.iter() { if left_on.is_empty() || left_on.len() != right_on.len() { @@ -3074,61 +3164,17 @@ impl MemberRules { if let (Some(left_cube_name), Some(right_cube_name)) = (left_cube_name, right_cube_name) { - matched = Some(( - left_cube_name, - right_cube_name, - left_keys, - right_keys, - left_on.clone(), - right_on.clone(), - )); + matched = + Some((left_cube_name, right_cube_name, left_keys, right_keys)); break 'pairs; } } } } - let Some(( - left_cube, - right_cube, - shared_left_keys, - shared_right_keys, - matched_left_cols, - matched_right_cols, - )) = matched - else { - return false; - }; - - // The join key must be fully within the GROUP BY dimensions: every - // group-by column must be one of the join-key columns, and every - // join-key pair must be grouped. This is what makes the multi-fact - // stitch over the group-by key match the requested join. 
- let Some(group_referenced_expr) = - &egraph.index(subst[group_expr_var]).data.referenced_expr - else { + let Some((left_cube, right_cube, shared_left_keys, shared_right_keys)) = matched else { return false; }; - let group_cols = referenced_columns(group_referenced_expr); - if group_cols.is_empty() { - return false; - } - let join_key_cols: HashSet = matched_left_cols - .iter() - .chain(matched_right_cols.iter()) - .map(|c| c.flat_name()) - .collect(); - if !group_cols.iter().all(|c| join_key_cols.contains(c)) { - return false; - } - let group_set: HashSet<&String> = group_cols.iter().collect(); - for (left_col, right_col) in matched_left_cols.iter().zip(matched_right_cols.iter()) { - if !group_set.contains(&left_col.flat_name()) - && !group_set.contains(&right_col.flat_name()) - { - return false; - } - } // Re-introduce INNER/LEFT/RIGHT semantics on top of the FULL OUTER // multi-fact stitch by requiring the join key of each "must be @@ -3158,6 +3204,22 @@ impl MemberRules { presence_members.extend(shared_right_keys.iter().cloned()); } + // The join key as underlying cube members, unioned with any keys + // already recorded on the left wrapper (for chained 3+ view joins). 
+ let mut join_member_names: Vec = shared_left_keys + .iter() + .map(|k| resolve_underlying(k)) + .collect(); + if let Some(prev_var) = prev_join_members_var { + if let Some(prev) = + var_iter!(egraph[subst[prev_var]], MultiFactJoinWrapperJoinMembers).next() + { + join_member_names.extend(prev.iter().cloned()); + } + } + join_member_names.sort(); + join_member_names.dedup(); + for left_alias_to_cube in var_iter!(egraph[subst[left_alias_to_cube_var]], CubeScanAliasToCube) { @@ -3199,6 +3261,12 @@ impl MemberRules { egraph.add(LogicalPlanLanguage::CubeScanJoinHints(out_join_hints)), ); + let join_members_id = + egraph.add(LogicalPlanLanguage::MultiFactJoinWrapperJoinMembers( + MultiFactJoinWrapperJoinMembers(join_member_names.clone()), + )); + subst.insert(join_members_var, join_members_id); + // Add the join-semantics presence filters only once a // concrete merge is being produced, so a `false` return // never leaves a stale `subst` entry behind. @@ -3240,6 +3308,82 @@ impl MemberRules { } } + // Finalize a MultiFactJoinWrapper: only unwrap it (letting the standard + // aggregate push-down produce the merged multi-fact CubeScan) when the + // query's GROUP BY is exactly the recorded shared join key. This rejects + // ungrouped queries (no aggregate matches) and queries grouping by a + // non-join-key dimension. + fn finalize_shared_member_join( + &self, + scan_var: &'static str, + join_members_var: &'static str, + group_expr_var: &'static str, + ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { + let scan_var = var!(scan_var); + let join_members_var = var!(join_members_var); + let group_expr_var = var!(group_expr_var); + let meta_context = self.meta_context.clone(); + move |egraph, subst| { + fn dimension_member_name( + egraph: &mut CubeEGraph, + scan_id: Id, + column: &Column, + ) -> Option { + match egraph[scan_id].data.find_member_by_column(column) { + Some(((_, Member::Dimension { name, .. }, _), _)) + | Some(((_, Member::TimeDimension { name, .. 
}, _), _)) => Some(name.clone()), + _ => None, + } + } + + let resolve_underlying = |member_name: &str| -> String { + meta_context + .find_dimension_with_name(member_name) + .and_then(|dim| dim.alias_member.clone()) + .unwrap_or_else(|| member_name.to_string()) + }; + + let join_members: Vec = match var_iter!( + egraph[subst[join_members_var]], + MultiFactJoinWrapperJoinMembers + ) + .next() + { + Some(jm) => jm.clone(), + None => return false, + }; + if join_members.is_empty() { + return false; + } + let join_set: HashSet = join_members.into_iter().collect(); + + let group_exprs = match &egraph.index(subst[group_expr_var]).data.referenced_expr { + Some(exprs) => exprs.clone(), + None => return false, + }; + if group_exprs.is_empty() { + return false; + } + + // Every GROUP BY expression must be a dimension column of the merged + // scan whose underlying cube member is part of the recorded join key. + let mut group_underlying: HashSet = HashSet::new(); + for expr in &group_exprs { + let Expr::Column(column) = expr else { + return false; + }; + let Some(member_name) = dimension_member_name(egraph, subst[scan_var], column) + else { + return false; + }; + group_underlying.insert(resolve_underlying(&member_name)); + } + + // GROUP BY must match the join key exactly. 
+ group_underlying == join_set + } + } + fn push_down_cross_join_to_cubescan_rewrite( &self, name: &str, diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 2c470f280007c..ba513fec228e5 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -70,11 +70,45 @@ fn views_meta() -> Vec { description: None, title: None, r#type: V1CubeMetaType::View, + dimensions: vec![ + dimension("orders_view.customer_city", "customers.customer_city"), + dimension("orders_view.status", "orders.status"), + ], + measures: vec![measure("orders_view.revenue", "orders.revenue", "sum")], + segments: vec![], + joins: None, + folders: None, + nested_folders: None, + hierarchies: None, + meta: None, + }, + CubeMeta { + name: "returns_view".to_string(), + description: None, + title: None, + r#type: V1CubeMetaType::View, dimensions: vec![dimension( - "orders_view.customer_city", + "returns_view.customer_city", "customers.customer_city", )], - measures: vec![measure("orders_view.revenue", "orders.revenue", "sum")], + measures: vec![measure("returns_view.refunds", "returns.refunds", "sum")], + segments: vec![], + joins: None, + folders: None, + nested_folders: None, + hierarchies: None, + meta: None, + }, + CubeMeta { + name: "payments_view".to_string(), + description: None, + title: None, + r#type: V1CubeMetaType::View, + dimensions: vec![dimension( + "payments_view.customer_city", + "customers.customer_city", + )], + measures: vec![measure("payments_view.paid", "payments.paid", "sum")], segments: vec![], joins: None, folders: None, @@ -366,3 +400,188 @@ async fn test_group_by_full_join_two_views_on_shared_member() { } ) } + +/// Joining three views on the shared key (FULL JOIN, so no presence filters) +/// merges into a single multi-fact CubeScan with all three measures. 
+#[tokio::test] +async fn test_group_by_full_join_three_views_on_shared_member() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue), measure(r.refunds) + FROM customers_view c + FULL JOIN orders_view o ON o.customer_city = c.customer_city + FULL JOIN returns_view r ON r.customer_city = c.customer_city + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "orders_view.revenue".to_string(), + "returns_view.refunds".to_string(), + ]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + join_hints: Some(vec![ + vec!["customers_view".to_string(), "orders_view".to_string()], + vec!["customers_view".to_string(), "returns_view".to_string()], + ]), + ..Default::default() + } + ) +} + +/// Joining four views on the shared key (FULL JOIN) merges into a single +/// multi-fact CubeScan with all four measures. 
+#[tokio::test] +async fn test_group_by_full_join_four_views_on_shared_member() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue), measure(r.refunds), measure(p.paid) + FROM customers_view c + FULL JOIN orders_view o ON o.customer_city = c.customer_city + FULL JOIN returns_view r ON r.customer_city = c.customer_city + FULL JOIN payments_view p ON p.customer_city = c.customer_city + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "orders_view.revenue".to_string(), + "returns_view.refunds".to_string(), + "payments_view.paid".to_string(), + ]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + join_hints: Some(vec![ + vec!["customers_view".to_string(), "orders_view".to_string()], + vec!["customers_view".to_string(), "returns_view".to_string()], + vec!["customers_view".to_string(), "payments_view".to_string()], + ]), + ..Default::default() + } + ) +} + +/// A WHERE filter on top of the join is pushed through the wrapper into the +/// merged scan and shows up as a Cube query filter alongside the join-semantics +/// `set` filter. 
+#[tokio::test] +async fn test_group_by_left_join_with_where_filter() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue) + FROM customers_view c + LEFT JOIN orders_view o ON o.customer_city = c.customer_city + WHERE c.status = 'active' + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec!["orders_view.revenue".to_string()]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + filters: Some(vec![ + set_filter("customers_view.customer_city"), + V1LoadRequestQueryFilterItem { + member: Some("customers_view.status".to_string()), + operator: Some("equals".to_string()), + values: Some(vec!["active".to_string()]), + or: None, + and: None, + }, + ]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} + +/// A filter placed in the ON clause (in addition to the shared-key equality). 
+#[tokio::test] +async fn test_group_by_left_join_with_on_filter() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue) + FROM customers_view c + LEFT JOIN orders_view o + ON o.customer_city = c.customer_city AND o.status = 'completed' + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec!["orders_view.revenue".to_string()]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + filters: Some(vec![ + set_filter("customers_view.customer_city"), + V1LoadRequestQueryFilterItem { + member: Some("orders_view.status".to_string()), + operator: Some("equals".to_string()), + values: Some(vec!["completed".to_string()]), + or: None, + and: None, + }, + ]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} From 1aca9d9f0d43981109bf79c5e995bc9abbfb67f9 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 18:43:35 +0000 Subject: [PATCH 15/21] docs: document N-way view joins and filter support in the SQL API Co-authored-by: Pavel Tiunov --- .../docs/data-modeling/multi-fact-views.mdx | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx index 0e094c8c56af0..56832f52384bd 100644 --- a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx +++ b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx @@ -314,7 +314,7 @@ with `NULL` values for the missing fact table. You don't have to define a dedicated multi-fact view to get multi-fact behavior. 
The [SQL API][ref-sql-api] produces the same query when you **join -two views on a dimension they share** and group by that dimension. +two or more views on a dimension they share** and group by that dimension. Suppose `orders_view` and `returns_view` are two separate views that each expose the customer's `name` (both backed by the same underlying @@ -348,6 +348,37 @@ This rewrite applies only when: by a different dimension are not merged and fall back to standard join handling. +### Joining three or more views + +The rewrite is not limited to two views. Chained joins on the same shared key +are merged into a single multi-fact query, with each view contributing its own +aggregating subquery: + +```sql +SELECT + o.name, + MEASURE(o.total_amount), + MEASURE(r.total_refund), + MEASURE(p.total_paid) +FROM orders_view o +FULL JOIN returns_view r ON r.name = o.name +FULL JOIN payments_view p ON p.name = o.name +GROUP BY 1 +``` + +### Filtering the join + +Filters on top of the join are supported and are applied to the merged query: + +- A `WHERE` clause is pushed into the merged scan. A predicate on a dimension + shared by all facts filters the whole result; a predicate on a fact-specific + dimension filters only that fact's subquery. +- A predicate in the `ON` clause that the planner can attach to a single side + (for example, a condition on the optional side of a `LEFT JOIN`) becomes a + filter on that fact. Predicates that the SQL planner can't push to one side + of an outer join (such as a left-table condition in a `LEFT JOIN ON`) aren't + supported by the planner and will raise an error. 
+ ### Join type The facts are stitched together with a `FULL JOIN` on the shared key, and the From 144bb60f17757e60fb12d96faaed23185874a75f Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 19:42:06 +0000 Subject: [PATCH 16/21] fix(cubesql): address review nits on MultiFactJoinWrapper rewrite - Remove duplicate #[allow(clippy::too_many_arguments)] on merge_shared_member_join - Document the left-deep-only assumption of shared-member-join-extend-wrapper (right-associative a JOIN (b JOIN c) is not chained) - Document that finalize only accepts plain-column GROUP BY (wrapped exprs like DATE_TRUNC fall back to standard handling) - Add a 3-way LEFT join test pinning per-pass presence-filter accumulation through the extend-wrapper rule Co-authored-by: Pavel Tiunov --- .../src/compile/rewrite/rules/members.rs | 11 ++++- .../src/compile/test/test_cube_join_views.rs | 47 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 864d4335a8128..0c06c49baa9cc 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -513,6 +513,12 @@ impl RewriteRules for MemberRules { // supporting joins of 3+ views. The new join must again be on a // dimension resolving to the same underlying member; its key is // unioned into the wrapper's recorded join members. + // + // Only the wrapper-on-the-left shape is matched, which is the + // left-deep tree SQL parsers produce for `a JOIN b JOIN c`. A + // right-associative `a JOIN (b JOIN c)` (explicit parentheses) keeps + // the wrapper on the right and is not chained; it falls back to + // standard join handling. 
transforming_rewrite( "shared-member-join-extend-wrapper", join( @@ -3041,7 +3047,6 @@ impl MemberRules { } } - #[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)] fn merge_shared_member_join( &self, @@ -3367,6 +3372,10 @@ impl MemberRules { // Every GROUP BY expression must be a dimension column of the merged // scan whose underlying cube member is part of the recorded join key. + // Only plain column references are accepted: a wrapped expression + // (e.g. `GROUP BY DATE_TRUNC('day', c.day)`) can't be matched against + // the recorded join key, so the wrapper is left in place and the + // query falls back to standard join handling. let mut group_underlying: HashSet = HashSet::new(); for expr in &group_exprs { let Expr::Column(column) = expr else { diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index ba513fec228e5..6b983dc3ef05d 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -585,3 +585,50 @@ async fn test_group_by_left_join_with_on_filter() { } ) } + +/// A 3-way LEFT join pins the per-pass presence-filter accumulation through +/// `shared-member-join-extend-wrapper`: each LEFT join contributes a `set` +/// filter on its own left-side join key. 
+#[tokio::test] +async fn test_group_by_left_join_three_views_presence_filters() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue), measure(r.refunds) + FROM customers_view c + LEFT JOIN orders_view o ON o.customer_city = c.customer_city + LEFT JOIN returns_view r ON r.customer_city = o.customer_city + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec![ + "orders_view.revenue".to_string(), + "returns_view.refunds".to_string(), + ]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + filters: Some(vec![ + set_filter("orders_view.customer_city"), + set_filter("customers_view.customer_city"), + ]), + join_hints: Some(vec![ + vec!["customers_view".to_string(), "orders_view".to_string()], + vec!["orders_view".to_string(), "returns_view".to_string()], + ]), + ..Default::default() + } + ) +} From 3d11f94999a0f1deb305aed56aff52afdfe1ff1c Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 20:06:59 +0000 Subject: [PATCH 17/21] feat(cubesql): support view joins on date_trunc / shared time dimensions A join written directly on DATE_TRUNC (ON DATE_TRUNC(g, a.ts) = DATE_TRUNC(g, b.ts)) is lowered by the planner to Filter(, CrossJoin(...)) rather than a column equi-join, so it never reached the shared-member merge. Add a shared-time-member-cross-join-to-wrapper rule that recognizes this shape, resolves both truncated columns to the same underlying time member at the same granularity, and merges into an INNER multi-fact CubeScan (both keys marked present). Grouping by DATE_TRUNC already worked via referenced-column collapse; the finalize comment is corrected accordingly. 
Adds time dimensions to the test views and tests for join-on-raw-time + GROUP BY DATE_TRUNC (LEFT) and join-on-DATE_TRUNC + GROUP BY DATE_TRUNC (INNER). Co-authored-by: Pavel Tiunov --- .../docs/data-modeling/multi-fact-views.mdx | 29 ++ .../src/compile/rewrite/rules/members.rs | 309 +++++++++++++++++- .../src/compile/test/test_cube_join_views.rs | 107 +++++- 3 files changed, 439 insertions(+), 6 deletions(-) diff --git a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx index 56832f52384bd..23e866254e0ae 100644 --- a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx +++ b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx @@ -366,6 +366,35 @@ FULL JOIN payments_view p ON p.name = o.name GROUP BY 1 ``` +### Joining on a time dimension + +A common multi-fact pattern joins facts on a shared time dimension and groups by +a truncated grain. Both shapes are supported: + +- **Join on the raw time column, group by `DATE_TRUNC`:** + +```sql +SELECT DATE_TRUNC('day', o.created_at), MEASURE(o.total_amount), MEASURE(r.total_refund) +FROM orders_view o +LEFT JOIN returns_view r ON r.created_at = o.created_at +GROUP BY 1 +``` + +- **Join directly on `DATE_TRUNC`:** + +```sql +SELECT DATE_TRUNC('day', o.created_at), MEASURE(o.total_amount), MEASURE(r.total_refund) +FROM orders_view o +JOIN returns_view r ON DATE_TRUNC('day', r.created_at) = DATE_TRUNC('day', o.created_at) +GROUP BY 1 +``` + +In both cases the grouped column is emitted as a time dimension with its +granularity. A join written directly on `DATE_TRUNC` is an `INNER` join (the SQL +planner expresses it as a filtered cross join), so both sides must share a key; +both truncated columns must resolve to the same underlying time member at the +same granularity. 
+ ### Filtering the join Filters on top of the join are supported and are applied to the merged query: diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 0c06c49baa9cc..8a7422e7ae872 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -590,6 +590,95 @@ impl RewriteRules for MemberRules { Some("?prev_join_members"), ), ), + // Merge an INNER join expressed as a date-truncated equality + // (`ON DATE_TRUNC(g, a.ts) = DATE_TRUNC(g, b.ts)`), which the SQL + // planner lowers to Filter(, CrossJoin(...)) rather than a column + // equi-join, into a single multi-fact CubeScan. Both truncated + // columns must resolve to the same underlying time member at the + // same granularity. A filtered cross join is INNER, so both keys are + // marked present. + transforming_rewrite( + "shared-time-member-cross-join-to-wrapper", + filter( + binary_expr( + self.fun_expr( + "DateTrunc", + vec![ + literal_expr("?left_granularity"), + column_expr("?left_column"), + ], + ), + "=", + self.fun_expr( + "DateTrunc", + vec![ + literal_expr("?right_granularity"), + column_expr("?right_column"), + ], + ), + ), + cross_join( + cube_scan( + "?left_alias_to_cube", + "?left_members", + "?left_filters", + "?left_orders", + "CubeScanLimit:None", + "CubeScanOffset:None", + "?left_split", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?left_join_hints", + ), + cube_scan( + "?right_alias_to_cube", + "?right_members", + "?right_filters", + "?right_orders", + "CubeScanLimit:None", + "CubeScanOffset:None", + "?right_split", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?right_join_hints", + ), + ), + ), + multi_fact_join_wrapper( + cube_scan( + "?out_alias_to_cube", + cube_scan_members("?left_members", "?right_members"), + 
cube_scan_filters("?left_filters", "?right_filters"), + cube_scan_order_empty_tail(), + "CubeScanLimit:None", + "CubeScanOffset:None", + "CubeScanSplit:false", + "CubeScanCanPushdownJoin:true", + "CubeScanWrapped:false", + "CubeScanUngrouped:true", + "?out_join_hints", + ), + "?join_members", + ), + self.merge_shared_time_cross_join( + "?left_alias_to_cube", + "?right_alias_to_cube", + "?out_alias_to_cube", + "?left_members", + "?right_members", + "?left_column", + "?left_granularity", + "?right_column", + "?right_granularity", + "?left_join_hints", + "?right_join_hints", + "?out_join_hints", + "?left_filters", + "?join_members", + ), + ), // Push a Filter (e.g. a WHERE on top of the join) down through the // wrapper into the merged CubeScan, where the standard filter // push-down rules turn it into a Cube query filter. @@ -3313,6 +3402,213 @@ impl MemberRules { } } + // Same merge as `merge_shared_member_join`, but the join is a date-truncated + // equality the planner lowered to Filter(CrossJoin(...)). Resolves the two + // truncated columns to time-dimension members on each side, requires the + // same underlying member at the same granularity, and produces an INNER + // multi-fact CubeScan wrapped in a MultiFactJoinWrapper. 
+ #[allow(clippy::too_many_arguments)] + fn merge_shared_time_cross_join( + &self, + left_alias_to_cube_var: &'static str, + right_alias_to_cube_var: &'static str, + out_alias_to_cube_var: &'static str, + left_members_var: &'static str, + right_members_var: &'static str, + left_column_var: &'static str, + left_granularity_var: &'static str, + right_column_var: &'static str, + right_granularity_var: &'static str, + left_join_hints_var: &'static str, + right_join_hints_var: &'static str, + out_join_hints_var: &'static str, + left_filters_var: &'static str, + join_members_var: &'static str, + ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { + let left_alias_to_cube_var = var!(left_alias_to_cube_var); + let right_alias_to_cube_var = var!(right_alias_to_cube_var); + let out_alias_to_cube_var = var!(out_alias_to_cube_var); + let left_members_var = var!(left_members_var); + let right_members_var = var!(right_members_var); + let left_column_var = var!(left_column_var); + let left_granularity_var = var!(left_granularity_var); + let right_column_var = var!(right_column_var); + let right_granularity_var = var!(right_granularity_var); + let left_join_hints_var = var!(left_join_hints_var); + let right_join_hints_var = var!(right_join_hints_var); + let out_join_hints_var = var!(out_join_hints_var); + let left_filters_var = var!(left_filters_var); + let join_members_var = var!(join_members_var); + let meta_context = self.meta_context.clone(); + let enable_tesseract_sql_planner = self.config_obj.enable_tesseract_sql_planner(); + move |egraph, subst| { + if !enable_tesseract_sql_planner { + return false; + } + fn dimension_member_name( + egraph: &mut CubeEGraph, + members_id: Id, + column: &Column, + ) -> Option { + match egraph[members_id].data.find_member_by_column(column) { + Some(((_, Member::Dimension { name, .. }, _), _)) + | Some(((_, Member::TimeDimension { name, .. 
}, _), _)) => Some(name.clone()), + _ => None, + } + } + + let resolve_underlying = |member_name: &str| -> String { + meta_context + .find_dimension_with_name(member_name) + .and_then(|dim| dim.alias_member.clone()) + .unwrap_or_else(|| member_name.to_string()) + }; + + // Both sides must be truncated to the same granularity for the + // stitch key to line up. + let Some(left_granularity) = + var_iter!(egraph[subst[left_granularity_var]], LiteralExprValue) + .find_map(|v| utils::parse_granularity(v, false)) + else { + return false; + }; + let Some(right_granularity) = + var_iter!(egraph[subst[right_granularity_var]], LiteralExprValue) + .find_map(|v| utils::parse_granularity(v, false)) + else { + return false; + }; + if left_granularity != right_granularity { + return false; + } + + let Some(binary_left_col) = var_iter!(egraph[subst[left_column_var]], ColumnExprColumn) + .next() + .cloned() + else { + return false; + }; + let Some(binary_right_col) = + var_iter!(egraph[subst[right_column_var]], ColumnExprColumn) + .next() + .cloned() + else { + return false; + }; + + // The equality columns may be written in either order relative to + // the cross-join sides, so resolve each against both scans and pick + // the assignment where one column belongs to the left scan and the + // other to the right. 
+ let bl_on_left = + dimension_member_name(egraph, subst[left_members_var], &binary_left_col); + let br_on_right = + dimension_member_name(egraph, subst[right_members_var], &binary_right_col); + let br_on_left = + dimension_member_name(egraph, subst[left_members_var], &binary_right_col); + let bl_on_right = + dimension_member_name(egraph, subst[right_members_var], &binary_left_col); + let (left_key, right_key) = if let (Some(l), Some(r)) = (bl_on_left, br_on_right) { + (l, r) + } else if let (Some(l), Some(r)) = (br_on_left, bl_on_right) { + (l, r) + } else { + return false; + }; + + if resolve_underlying(&left_key) != resolve_underlying(&right_key) { + return false; + } + let Some(left_cube) = left_key.split('.').next().map(|s| s.to_string()) else { + return false; + }; + let Some(right_cube) = right_key.split('.').next().map(|s| s.to_string()) else { + return false; + }; + + // A filtered cross join is an INNER join: require both keys present. + let presence_members: Vec = vec![left_key.clone(), right_key.clone()]; + let mut join_member_names: Vec = vec![resolve_underlying(&left_key)]; + join_member_names.sort(); + join_member_names.dedup(); + + for left_alias_to_cube in + var_iter!(egraph[subst[left_alias_to_cube_var]], CubeScanAliasToCube) + { + for right_alias_to_cube in + var_iter!(egraph[subst[right_alias_to_cube_var]], CubeScanAliasToCube) + { + for left_join_hints in + var_iter!(egraph[subst[left_join_hints_var]], CubeScanJoinHints) + { + for right_join_hints in + var_iter!(egraph[subst[right_join_hints_var]], CubeScanJoinHints) + { + let out_alias_to_cube = CubeScanAliasToCube( + left_alias_to_cube + .iter() + .chain(right_alias_to_cube.iter()) + .cloned() + .collect(), + ); + + let out_join_hints = CubeScanJoinHints( + left_join_hints + .iter() + .chain(right_join_hints.iter()) + .cloned() + .chain(iter::once(vec![left_cube.clone(), right_cube.clone()])) + .collect(), + ); + + subst.insert( + out_alias_to_cube_var, + 
egraph.add(LogicalPlanLanguage::CubeScanAliasToCube( + out_alias_to_cube, + )), + ); + + subst.insert( + out_join_hints_var, + egraph.add(LogicalPlanLanguage::CubeScanJoinHints(out_join_hints)), + ); + + let join_members_id = + egraph.add(LogicalPlanLanguage::MultiFactJoinWrapperJoinMembers( + MultiFactJoinWrapperJoinMembers(join_member_names.clone()), + )); + subst.insert(join_members_var, join_members_id); + + let mut acc = subst[left_filters_var]; + for name in &presence_members { + let member = egraph.add(LogicalPlanLanguage::FilterMemberMember( + crate::compile::rewrite::FilterMemberMember(name.clone()), + )); + let op = egraph.add(LogicalPlanLanguage::FilterMemberOp( + crate::compile::rewrite::FilterMemberOp("set".to_string()), + )); + let values = egraph.add(LogicalPlanLanguage::FilterMemberValues( + crate::compile::rewrite::FilterMemberValues(vec![]), + )); + let filter_member = egraph + .add(LogicalPlanLanguage::FilterMember([member, op, values])); + acc = egraph.add(LogicalPlanLanguage::CubeScanFilters(vec![ + filter_member, + acc, + ])); + } + subst.insert(left_filters_var, acc); + + return true; + } + } + } + } + + false + } + } + // Finalize a MultiFactJoinWrapper: only unwrap it (letting the standard // aggregate push-down produce the merged multi-fact CubeScan) when the // query's GROUP BY is exactly the recorded shared join key. This rejects @@ -3370,12 +3666,15 @@ impl MemberRules { return false; } - // Every GROUP BY expression must be a dimension column of the merged + // Every GROUP BY expression must reference a dimension of the merged // scan whose underlying cube member is part of the recorded join key. - // Only plain column references are accepted: a wrapped expression - // (e.g. `GROUP BY DATE_TRUNC('day', c.day)`) can't be matched against - // the recorded join key, so the wrapper is left in place and the - // query falls back to standard join handling. 
+ // `group_exprs` comes from `referenced_expr`, which collapses a + // wrapped expression to the column(s) it references, so a time-grain + // group-by such as `DATE_TRUNC('day', c.created_at)` is matched via + // its inner `created_at` column. An expression that references no + // column (or more/other columns than the join key) won't match, so + // the wrapper is left in place and the query falls back to standard + // join handling. let mut group_underlying: HashSet = HashSet::new(); for expr in &group_exprs { let Expr::Column(column) = expr else { diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 6b983dc3ef05d..8df7f8f51015d 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -1,6 +1,9 @@ use std::sync::Arc; -use cubeclient::models::{V1CubeMetaType, V1LoadRequestQuery, V1LoadRequestQueryFilterItem}; +use cubeclient::models::{ + V1CubeMetaType, V1LoadRequestQuery, V1LoadRequestQueryFilterItem, + V1LoadRequestQueryTimeDimension, +}; use pretty_assertions::assert_eq; use crate::{ @@ -27,6 +30,12 @@ fn views_meta() -> Vec { alias_member: Some(alias.to_string()), ..CubeMetaDimension::default() }; + let time_dimension = |name: &str, alias: &str| CubeMetaDimension { + name: name.to_string(), + r#type: "time".to_string(), + alias_member: Some(alias.to_string()), + ..CubeMetaDimension::default() + }; let measure = |name: &str, alias: &str, agg: &str| CubeMetaMeasure { name: name.to_string(), title: None, @@ -52,6 +61,7 @@ fn views_meta() -> Vec { // A second dimension that is NOT a join key, used to test that a // query grouping by it (instead of the join key) is not merged. 
dimension("customers_view.status", "customers.status"), + time_dimension("customers_view.created_at", "customers.created_at"), ], measures: vec![measure( "customers_view.avg_age", @@ -73,6 +83,7 @@ fn views_meta() -> Vec { dimensions: vec![ dimension("orders_view.customer_city", "customers.customer_city"), dimension("orders_view.status", "orders.status"), + time_dimension("orders_view.created_at", "customers.created_at"), ], measures: vec![measure("orders_view.revenue", "orders.revenue", "sum")], segments: vec![], @@ -632,3 +643,97 @@ async fn test_group_by_left_join_three_views_presence_filters() { } ) } + +/// Joining two views on a raw shared time column and grouping by +/// `DATE_TRUNC('day', ...)` merges into a single multi-fact CubeScan with the +/// grouped column emitted as a `timeDimensions` entry (granularity `day`). +#[tokio::test] +async fn test_left_join_raw_time_group_by_date_trunc() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT DATE_TRUNC('day', c.created_at), measure(o.revenue) + FROM customers_view c + LEFT JOIN orders_view o ON o.created_at = c.created_at + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec!["orders_view.revenue".to_string()]), + dimensions: Some(vec![]), + segments: Some(vec![]), + time_dimensions: Some(vec![V1LoadRequestQueryTimeDimension { + dimension: "customers_view.created_at".to_string(), + granularity: Some("day".to_string()), + date_range: None, + }]), + order: Some(vec![]), + filters: Some(vec![set_filter("customers_view.created_at")]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} + +/// Joining two views directly on `DATE_TRUNC('day', ...)` (which the SQL planner +/// lowers to `Filter(, CrossJoin(...))`, i.e. 
an INNER join) merges into a +/// single multi-fact CubeScan. Both truncated keys are marked present (INNER). +#[tokio::test] +async fn test_inner_join_on_date_trunc_group_by_date_trunc() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT DATE_TRUNC('day', c.created_at), measure(o.revenue) + FROM customers_view c + LEFT JOIN orders_view o + ON DATE_TRUNC('day', o.created_at) = DATE_TRUNC('day', c.created_at) + GROUP BY 1 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec!["orders_view.revenue".to_string()]), + dimensions: Some(vec![]), + segments: Some(vec![]), + time_dimensions: Some(vec![V1LoadRequestQueryTimeDimension { + dimension: "customers_view.created_at".to_string(), + granularity: Some("day".to_string()), + date_range: None, + }]), + order: Some(vec![]), + filters: Some(vec![ + set_filter("orders_view.created_at"), + set_filter("customers_view.created_at"), + ]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} From e5e24bf39f2b8fa1b1a9c4c7d709e1f3cbf99624 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 20:09:25 +0000 Subject: [PATCH 18/21] test(cubesql): cover composite-key view joins (multiple dimensions) Add a shared customer_state dimension and tests for joining two views on a composite key (customer_city + customer_state) and grouping by both, plus a negative test that a partial GROUP BY (only one of the two join keys) does not merge. 
Co-authored-by: Pavel Tiunov --- .../src/compile/test/test_cube_join_views.rs | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 8df7f8f51015d..97d8d4cb8d305 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -58,6 +58,7 @@ fn views_meta() -> Vec { r#type: V1CubeMetaType::View, dimensions: vec![ dimension("customers_view.customer_city", "customers.customer_city"), + dimension("customers_view.customer_state", "customers.customer_state"), // A second dimension that is NOT a join key, used to test that a // query grouping by it (instead of the join key) is not merged. dimension("customers_view.status", "customers.status"), @@ -82,6 +83,7 @@ fn views_meta() -> Vec { r#type: V1CubeMetaType::View, dimensions: vec![ dimension("orders_view.customer_city", "customers.customer_city"), + dimension("orders_view.customer_state", "customers.customer_state"), dimension("orders_view.status", "orders.status"), time_dimension("orders_view.created_at", "customers.created_at"), ], @@ -737,3 +739,81 @@ async fn test_inner_join_on_date_trunc_group_by_date_trunc() { } ) } + +/// Joining two views on a composite key (two shared dimensions) and grouping by +/// both merges into a single multi-fact CubeScan. The GROUP BY must cover the +/// full join key, and each LEFT-join key column is marked present. 
+#[tokio::test] +async fn test_left_join_on_multiple_dimensions_group_by_both() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT c.customer_city, c.customer_state, measure(o.revenue) + FROM customers_view c + LEFT JOIN orders_view o + ON o.customer_city = c.customer_city + AND o.customer_state = c.customer_state + GROUP BY 1, 2 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec!["orders_view.revenue".to_string()]), + dimensions: Some(vec![ + "customers_view.customer_city".to_string(), + "customers_view.customer_state".to_string(), + ]), + segments: Some(vec![]), + order: Some(vec![]), + filters: Some(vec![ + set_filter("customers_view.customer_state"), + set_filter("customers_view.customer_city"), + ]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} + +/// Grouping by only part of a composite join key must not merge: the GROUP BY +/// must cover the full join key, so this falls back to standard join handling +/// (which errors for ungrouped-style cube joins). 
+#[tokio::test] +async fn test_join_on_multiple_dimensions_partial_group_by_is_not_merged() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let result = plan_view_join( + r#" + SELECT c.customer_city, measure(o.revenue) + FROM customers_view c + LEFT JOIN orders_view o + ON o.customer_city = c.customer_city + AND o.customer_state = c.customer_state + GROUP BY 1 + "#, + true, + ) + .await; + + assert!( + result.is_err(), + "expected partial-group-by composite join not to merge, got: {:?}", + result.map(|p| p.as_logical_plan().find_cube_scan().request) + ); +} From 3614128842525b23735dfa8cf23f04ff0b154a8e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 21:39:43 +0000 Subject: [PATCH 19/21] feat(cubesql): support view joins on date_trunc combined with a dimension The planner turns 'ON a.dim = b.dim AND DATE_TRUNC(g, a.ts) = DATE_TRUNC(g, b.ts)' into Filter(<truncated-time equality>, Join(a.dim = b.dim, ...)). The column join becomes a MultiFactJoinWrapper; add a multi-fact-join-wrapper-absorb-time-key rule that folds the truncated time member into the wrapper's recorded join key (marking both time columns present, since a post-join equality is INNER on that key) so a query grouped by both the time dimension and the dimension merges into one multi-fact CubeScan. Adds a test for the mixed DATE_TRUNC + dimension join. 
Co-authored-by: Pavel Tiunov --- .../docs/data-modeling/multi-fact-views.mdx | 12 + .../src/compile/rewrite/rules/members.rs | 211 ++++++++++++++++++ .../src/compile/test/test_cube_join_views.rs | 55 +++++ 3 files changed, 278 insertions(+) diff --git a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx index 23e866254e0ae..8aa9ab49bf013 100644 --- a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx +++ b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx @@ -395,6 +395,18 @@ planner expresses it as a filtered cross join), so both sides must share a key; both truncated columns must resolve to the same underlying time member at the same granularity. +You can also combine a `DATE_TRUNC` equality with a plain dimension equality in +the same join (a composite key), and group by both: + +```sql +SELECT DATE_TRUNC('day', o.created_at), o.name, MEASURE(o.total_amount), MEASURE(r.total_refund) +FROM orders_view o +JOIN returns_view r + ON DATE_TRUNC('day', r.created_at) = DATE_TRUNC('day', o.created_at) + AND r.name = o.name +GROUP BY 1, 2 +``` + ### Filtering the join Filters on top of the join are supported and are applied to the merged query: diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 8a7422e7ae872..632e862a4bacd 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -679,6 +679,79 @@ impl RewriteRules for MemberRules { "?join_members", ), ), + // Absorb a date-truncated equality filter sitting on top of a + // MultiFactJoinWrapper as an additional (time) join key. This covers + // joins on a mix of a plain dimension and a DATE_TRUNC: the planner + // turns `ON a.dim = b.dim AND DATE_TRUNC(g, a.ts) = DATE_TRUNC(g, b.ts)` + // into Filter(, Join(a.dim = b.dim, ...)). 
The column join + // becomes the wrapper; this rule folds the truncated time member into + // the recorded join key (and marks both time columns present, since a + // post-join equality is effectively INNER on that key). + transforming_rewrite( + "multi-fact-join-wrapper-absorb-time-key", + filter( + binary_expr( + self.fun_expr( + "DateTrunc", + vec![ + literal_expr("?abs_left_granularity"), + column_expr("?abs_left_column"), + ], + ), + "=", + self.fun_expr( + "DateTrunc", + vec![ + literal_expr("?abs_right_granularity"), + column_expr("?abs_right_column"), + ], + ), + ), + multi_fact_join_wrapper( + cube_scan( + "?abs_alias_to_cube", + "?abs_members", + "?abs_filters", + "?abs_orders", + "?abs_limit", + "?abs_offset", + "?abs_split", + "?abs_can_pushdown_join", + "?abs_wrapped", + "?abs_ungrouped", + "?abs_join_hints", + ), + "?abs_prev_join_members", + ), + ), + multi_fact_join_wrapper( + cube_scan( + "?abs_alias_to_cube", + "?abs_members", + "?abs_out_filters", + "?abs_orders", + "?abs_limit", + "?abs_offset", + "?abs_split", + "?abs_can_pushdown_join", + "?abs_wrapped", + "?abs_ungrouped", + "?abs_join_hints", + ), + "?abs_join_members", + ), + self.absorb_time_key_into_wrapper( + "?abs_members", + "?abs_left_column", + "?abs_left_granularity", + "?abs_right_column", + "?abs_right_granularity", + "?abs_filters", + "?abs_out_filters", + "?abs_prev_join_members", + "?abs_join_members", + ), + ), // Push a Filter (e.g. a WHERE on top of the join) down through the // wrapper into the merged CubeScan, where the standard filter // push-down rules turn it into a Cube query filter. @@ -3609,6 +3682,144 @@ impl MemberRules { } } + // Fold a date-truncated equality (DATE_TRUNC(g, a.ts) = DATE_TRUNC(g, b.ts)) + // that sits on top of a MultiFactJoinWrapper into the wrapper's recorded join + // key. Both truncated columns must resolve to the same underlying time member + // (at the same granularity) on the merged scan; both are marked present. 
+ #[allow(clippy::too_many_arguments)] + fn absorb_time_key_into_wrapper( + &self, + members_var: &'static str, + left_column_var: &'static str, + left_granularity_var: &'static str, + right_column_var: &'static str, + right_granularity_var: &'static str, + filters_var: &'static str, + out_filters_var: &'static str, + prev_join_members_var: &'static str, + join_members_var: &'static str, + ) -> impl Fn(&mut CubeEGraph, &mut Subst) -> bool { + let members_var = var!(members_var); + let left_column_var = var!(left_column_var); + let left_granularity_var = var!(left_granularity_var); + let right_column_var = var!(right_column_var); + let right_granularity_var = var!(right_granularity_var); + let filters_var = var!(filters_var); + let out_filters_var = var!(out_filters_var); + let prev_join_members_var = var!(prev_join_members_var); + let join_members_var = var!(join_members_var); + let meta_context = self.meta_context.clone(); + let enable_tesseract_sql_planner = self.config_obj.enable_tesseract_sql_planner(); + move |egraph, subst| { + if !enable_tesseract_sql_planner { + return false; + } + fn dimension_member_name( + egraph: &mut CubeEGraph, + members_id: Id, + column: &Column, + ) -> Option { + match egraph[members_id].data.find_member_by_column(column) { + Some(((_, Member::Dimension { name, .. }, _), _)) + | Some(((_, Member::TimeDimension { name, .. 
}, _), _)) => Some(name.clone()), + _ => None, + } + } + + let resolve_underlying = |member_name: &str| -> String { + meta_context + .find_dimension_with_name(member_name) + .and_then(|dim| dim.alias_member.clone()) + .unwrap_or_else(|| member_name.to_string()) + }; + + let Some(left_granularity) = + var_iter!(egraph[subst[left_granularity_var]], LiteralExprValue) + .find_map(|v| utils::parse_granularity(v, false)) + else { + return false; + }; + let Some(right_granularity) = + var_iter!(egraph[subst[right_granularity_var]], LiteralExprValue) + .find_map(|v| utils::parse_granularity(v, false)) + else { + return false; + }; + if left_granularity != right_granularity { + return false; + } + + let Some(left_col) = var_iter!(egraph[subst[left_column_var]], ColumnExprColumn) + .next() + .cloned() + else { + return false; + }; + let Some(right_col) = var_iter!(egraph[subst[right_column_var]], ColumnExprColumn) + .next() + .cloned() + else { + return false; + }; + + // Both columns live on the merged scan; resolve them to time members. + let Some(left_key) = dimension_member_name(egraph, subst[members_var], &left_col) + else { + return false; + }; + let Some(right_key) = dimension_member_name(egraph, subst[members_var], &right_col) + else { + return false; + }; + if resolve_underlying(&left_key) != resolve_underlying(&right_key) { + return false; + } + + // Time key recorded for the GROUP BY check at finalize, unioned with + // the keys already on the wrapper. 
+ let mut join_member_names: Vec = vec![resolve_underlying(&left_key)]; + if let Some(prev) = var_iter!( + egraph[subst[prev_join_members_var]], + MultiFactJoinWrapperJoinMembers + ) + .next() + { + join_member_names.extend(prev.iter().cloned()); + } + join_member_names.sort(); + join_member_names.dedup(); + + let join_members_id = egraph.add(LogicalPlanLanguage::MultiFactJoinWrapperJoinMembers( + MultiFactJoinWrapperJoinMembers(join_member_names), + )); + subst.insert(join_members_var, join_members_id); + + // INNER on the time key: both columns must be present. + let presence_members = [left_key, right_key]; + let mut acc = subst[filters_var]; + for name in &presence_members { + let member = egraph.add(LogicalPlanLanguage::FilterMemberMember( + crate::compile::rewrite::FilterMemberMember(name.clone()), + )); + let op = egraph.add(LogicalPlanLanguage::FilterMemberOp( + crate::compile::rewrite::FilterMemberOp("set".to_string()), + )); + let values = egraph.add(LogicalPlanLanguage::FilterMemberValues( + crate::compile::rewrite::FilterMemberValues(vec![]), + )); + let filter_member = + egraph.add(LogicalPlanLanguage::FilterMember([member, op, values])); + acc = egraph.add(LogicalPlanLanguage::CubeScanFilters(vec![ + filter_member, + acc, + ])); + } + subst.insert(out_filters_var, acc); + + true + } + } + // Finalize a MultiFactJoinWrapper: only unwrap it (letting the standard // aggregate push-down produce the merged multi-fact CubeScan) when the // query's GROUP BY is exactly the recorded shared join key. 
This rejects diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 97d8d4cb8d305..3bca15f6766cf 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -817,3 +817,58 @@ async fn test_join_on_multiple_dimensions_partial_group_by_is_not_merged() { result.map(|p| p.as_logical_plan().find_cube_scan().request) ); } + +/// Joining on a mix of a `DATE_TRUNC` equality and a plain dimension equality. +/// The SQL planner makes the column equality the join key and keeps the +/// truncated-time equality as a filter on top; the rewrite folds the time +/// member into the join key so the whole thing merges into one multi-fact scan +/// grouped by the time dimension and the dimension. Both join (INNER) keys and +/// the absorbed time key are marked present. +#[tokio::test] +async fn test_inner_join_on_date_trunc_and_dimension() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let logical_plan = plan_view_join( + r#" + SELECT DATE_TRUNC('day', c.created_at), c.customer_city, measure(o.revenue) + FROM customers_view c + JOIN orders_view o + ON DATE_TRUNC('day', o.created_at) = DATE_TRUNC('day', c.created_at) + AND o.customer_city = c.customer_city + GROUP BY 1, 2 + "#, + true, + ) + .await + .unwrap() + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: Some(vec!["orders_view.revenue".to_string()]), + dimensions: Some(vec!["customers_view.customer_city".to_string()]), + segments: Some(vec![]), + time_dimensions: Some(vec![V1LoadRequestQueryTimeDimension { + dimension: "customers_view.created_at".to_string(), + granularity: Some("day".to_string()), + date_range: None, + }]), + order: Some(vec![]), + filters: Some(vec![ + set_filter("customers_view.created_at"), + set_filter("orders_view.created_at"), + 
set_filter("orders_view.customer_city"), + set_filter("customers_view.customer_city"), + ]), + join_hints: Some(vec![vec![ + "customers_view".to_string(), + "orders_view".to_string(), + ]]), + ..Default::default() + } + ) +} From 379a2a10f8d6f69ec64aef5ba5c05c96d6e68ffe Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 8 Jun 2026 00:00:34 +0000 Subject: [PATCH 20/21] fix(cubesql): store join-key granularity and require it to match GROUP BY Per review: the multi-fact stitch happens at the GROUP BY grain, so the join key's granularity must equal the GROUP BY granularity for a time member. Previously only the underlying member name was recorded, so joining on DATE_TRUNC('month', ...) while grouping by DATE_TRUNC('day', ...) merged and stitched at day grain, diverging from the written join. Store join_members as (underlying member, Option) on the wrapper (None for plain dimensions, Some(grain) for DATE_TRUNC time keys), and at finalize extract each GROUP BY expression's granularity from its original_expr (rather than referenced_expr, which drops the grain) and require the full (member, granularity) sets to match. Replaces the raw-time-join test with a granularity-mismatch negative test. Co-authored-by: Pavel Tiunov --- .../docs/data-modeling/multi-fact-views.mdx | 6 ++ .../cubesql/src/compile/rewrite/mod.rs | 8 +- .../src/compile/rewrite/rules/members.rs | 87 ++++++++++++------- .../src/compile/test/test_cube_join_views.rs | 46 +++++----- 4 files changed, 89 insertions(+), 58 deletions(-) diff --git a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx index 8aa9ab49bf013..327aa50cf2b40 100644 --- a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx +++ b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx @@ -395,6 +395,12 @@ planner expresses it as a filtered cross join), so both sides must share a key; both truncated columns must resolve to the same underlying time member at the same granularity. 
+The join-key granularity must match the `GROUP BY` granularity, because the +facts are stitched together at the grain you group by. Joining on +`DATE_TRUNC('month', …)` while grouping by `DATE_TRUNC('day', …)` is not merged +(it would silently stitch at day grain, diverging from the month-grain join) and +falls back to standard handling. + You can also combine a `DATE_TRUNC` equality with a plain dimension equality in the same join (a composite key), and group by both: diff --git a/rust/cubesql/cubesql/src/compile/rewrite/mod.rs b/rust/cubesql/cubesql/src/compile/rewrite/mod.rs index 30f77e3106ce1..eef32945cece1 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/mod.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/mod.rs @@ -544,12 +544,14 @@ crate::plan_to_language! { // Intermediate node produced while merging a join of two (view) // CubeScans on a shared cube member into a single multi-fact CubeScan. // `input` is the merged CubeScan; `join_members` holds the underlying - // cube members the scans were joined on, so the aggregate finalize rule - // can verify the GROUP BY matches the join key. Rewrite-only: it must be + // cube members the scans were joined on, each paired with the join-key + // granularity (`Some` for a `DATE_TRUNC` time key, `None` for a plain + // dimension), so the aggregate finalize rule can verify the GROUP BY + // matches the join key at the same grain. Rewrite-only: it must be // eliminated (unwrapped at the aggregate) before extraction. 
MultiFactJoinWrapper { input: Arc, - join_members: Vec, + join_members: Vec<(String, Option)>, }, } } diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs index 632e862a4bacd..ebccb095aec62 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/members.rs @@ -42,7 +42,7 @@ use cubeclient::models::V1CubeMetaMeasure; use datafusion::{ arrow::datatypes::DataType, logical_plan::{Column, DFSchema, Expr, Operator}, - physical_plan::aggregates::AggregateFunction, + physical_plan::{aggregates::AggregateFunction, functions::BuiltinScalarFunction}, scalar::ScalarValue, }; use egg::{Id, Subst, Var}; @@ -3373,9 +3373,9 @@ impl MemberRules { // The join key as underlying cube members, unioned with any keys // already recorded on the left wrapper (for chained 3+ view joins). - let mut join_member_names: Vec = shared_left_keys + let mut join_member_names: Vec<(String, Option)> = shared_left_keys .iter() - .map(|k| resolve_underlying(k)) + .map(|k| (resolve_underlying(k), None)) .collect(); if let Some(prev_var) = prev_join_members_var { if let Some(prev) = @@ -3601,7 +3601,10 @@ impl MemberRules { // A filtered cross join is an INNER join: require both keys present. let presence_members: Vec = vec![left_key.clone(), right_key.clone()]; - let mut join_member_names: Vec = vec![resolve_underlying(&left_key)]; + let mut join_member_names: Vec<(String, Option)> = vec![( + resolve_underlying(&left_key), + Some(left_granularity.clone()), + )]; join_member_names.sort(); join_member_names.dedup(); @@ -3777,7 +3780,10 @@ impl MemberRules { // Time key recorded for the GROUP BY check at finalize, unioned with // the keys already on the wrapper. 
- let mut join_member_names: Vec = vec![resolve_underlying(&left_key)]; + let mut join_member_names: Vec<(String, Option)> = vec![( + resolve_underlying(&left_key), + Some(left_granularity.clone()), + )]; if let Some(prev) = var_iter!( egraph[subst[prev_join_members_var]], MultiFactJoinWrapperJoinMembers @@ -3855,7 +3861,7 @@ impl MemberRules { .unwrap_or_else(|| member_name.to_string()) }; - let join_members: Vec = match var_iter!( + let join_members: Vec<(String, Option)> = match var_iter!( egraph[subst[join_members_var]], MultiFactJoinWrapperJoinMembers ) @@ -3867,39 +3873,62 @@ impl MemberRules { if join_members.is_empty() { return false; } - let join_set: HashSet = join_members.into_iter().collect(); - - let group_exprs = match &egraph.index(subst[group_expr_var]).data.referenced_expr { - Some(exprs) => exprs.clone(), - None => return false, - }; - if group_exprs.is_empty() { + let join_set: HashSet<(String, Option)> = join_members.into_iter().collect(); + + // The actual GROUP BY expressions (with their full structure, so a + // `DATE_TRUNC(g, col)` keeps its granularity). `referenced_expr` + // can't be used here because it collapses a wrapped expression to + // its inner column and would drop the grain. + let group_child_ids: Vec = + match var_list_iter!(egraph[subst[group_expr_var]], AggregateGroupExpr).next() { + Some(ids) => ids.clone(), + None => return false, + }; + if group_child_ids.is_empty() { return false; } - // Every GROUP BY expression must reference a dimension of the merged - // scan whose underlying cube member is part of the recorded join key. - // `group_exprs` comes from `referenced_expr`, which collapses a - // wrapped expression to the column(s) it references, so a time-grain - // group-by such as `DATE_TRUNC('day', c.created_at)` is matched via - // its inner `created_at` column. 
An expression that references no - // column (or more/other columns than the join key) won't match, so - // the wrapper is left in place and the query falls back to standard - // join handling. - let mut group_underlying: HashSet = HashSet::new(); - for expr in &group_exprs { - let Expr::Column(column) = expr else { + // Every GROUP BY expression must be either a plain dimension column + // (no granularity) or a `DATE_TRUNC(g, col)` over one, and the + // resulting (underlying member, granularity) pair must be part of + // the recorded join key. A join on `DATE_TRUNC('month', ...)` paired + // with `GROUP BY DATE_TRUNC('day', ...)` therefore won't merge: the + // multi-fact stitch happens at the GROUP BY grain, which must match + // the grain the user joined on. + let mut group_set: HashSet<(String, Option)> = HashSet::new(); + for child_id in &group_child_ids { + let Some(OriginalExpr::Expr(expr)) = egraph[*child_id].data.original_expr.clone() + else { return false; }; - let Some(member_name) = dimension_member_name(egraph, subst[scan_var], column) + let (column, granularity) = match &expr { + Expr::Column(col) => (col.clone(), None), + Expr::ScalarFunction { + fun: BuiltinScalarFunction::DateTrunc, + args, + } if args.len() == 2 => { + let Expr::Literal(scalar) = &args[0] else { + return false; + }; + let Some(granularity) = utils::parse_granularity(scalar, false) else { + return false; + }; + let Expr::Column(col) = &args[1] else { + return false; + }; + (col.clone(), Some(granularity)) + } + _ => return false, + }; + let Some(member_name) = dimension_member_name(egraph, subst[scan_var], &column) else { return false; }; - group_underlying.insert(resolve_underlying(&member_name)); + group_set.insert((resolve_underlying(&member_name), granularity)); } - // GROUP BY must match the join key exactly. - group_underlying == join_set + // GROUP BY must match the join key exactly, member and grain. 
+ group_set == join_set } } diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 3bca15f6766cf..434d562ee578d 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -646,49 +646,43 @@ async fn test_group_by_left_join_three_views_presence_filters() { ) } -/// Joining two views on a raw shared time column and grouping by -/// `DATE_TRUNC('day', ...)` merges into a single multi-fact CubeScan with the -/// grouped column emitted as a `timeDimensions` entry (granularity `day`). +/// The join-key granularity must match the GROUP BY granularity: the multi-fact +/// stitch happens at the GROUP BY grain, so joining on `DATE_TRUNC('month', ...)` +/// while grouping by `DATE_TRUNC('day', ...)` must not merge (it would silently +/// stitch at day grain, diverging from the month-grain join). #[tokio::test] -async fn test_left_join_raw_time_group_by_date_trunc() { +async fn test_join_date_trunc_granularity_mismatch_is_not_merged() { if !Rewriter::sql_push_down_enabled() { return; } init_testing_logger(); - let logical_plan = plan_view_join( + let request = plan_view_join( r#" SELECT DATE_TRUNC('day', c.created_at), measure(o.revenue) FROM customers_view c - LEFT JOIN orders_view o ON o.created_at = c.created_at + JOIN orders_view o + ON DATE_TRUNC('month', o.created_at) = DATE_TRUNC('month', c.created_at) GROUP BY 1 "#, true, ) .await .unwrap() - .as_logical_plan(); + .as_logical_plan() + .find_cube_scan() + .request; + // The merge did not happen: rather than a single grouped multi-fact scan, + // the query falls back to the raw ungrouped cross-join scan (pushed to the + // cube as SQL), so the measure is not pushed and the scan stays ungrouped. 
assert_eq!( - logical_plan.find_cube_scan().request, - V1LoadRequestQuery { - measures: Some(vec!["orders_view.revenue".to_string()]), - dimensions: Some(vec![]), - segments: Some(vec![]), - time_dimensions: Some(vec![V1LoadRequestQueryTimeDimension { - dimension: "customers_view.created_at".to_string(), - granularity: Some("day".to_string()), - date_range: None, - }]), - order: Some(vec![]), - filters: Some(vec![set_filter("customers_view.created_at")]), - join_hints: Some(vec![vec![ - "customers_view".to_string(), - "orders_view".to_string(), - ]]), - ..Default::default() - } - ) + request.ungrouped, + Some(true), + "expected month-grain join with day-grain GROUP BY not to merge, got: {:?}", + request + ); + assert_eq!(request.measures, Some(vec![]), "got: {:?}", request); } /// Joining two views directly on `DATE_TRUNC('day', ...)` (which the SQL planner From cebb74b5b87d65d7c945f573e1767561ed2f5f96 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 9 Jun 2026 00:33:25 +0000 Subject: [PATCH 21/21] docs(cubesql): require grained join for time-dimension multi-fact merge Align docs with the strict (member, grain) match: the join key granularity must equal the GROUP BY granularity, so a raw-time-column join does not pair with a DATE_TRUNC group-by. Remove the now-unsupported 'join on the raw time column, group by DATE_TRUNC' example and document the requirement. Add a negative test pinning that the raw-time join + DATE_TRUNC group-by is not merged. 
Co-authored-by: Pavel Tiunov --- .../docs/data-modeling/multi-fact-views.mdx | 38 +++++++++---------- .../src/compile/test/test_cube_join_views.rs | 29 ++++++++++++++ 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx index 327aa50cf2b40..d957b1d535f27 100644 --- a/docs-mintlify/docs/data-modeling/multi-fact-views.mdx +++ b/docs-mintlify/docs/data-modeling/multi-fact-views.mdx @@ -369,18 +369,7 @@ GROUP BY 1 ### Joining on a time dimension A common multi-fact pattern joins facts on a shared time dimension and groups by -a truncated grain. Both shapes are supported: - -- **Join on the raw time column, group by `DATE_TRUNC`:** - -```sql -SELECT DATE_TRUNC('day', o.created_at), MEASURE(o.total_amount), MEASURE(r.total_refund) -FROM orders_view o -LEFT JOIN returns_view r ON r.created_at = o.created_at -GROUP BY 1 -``` - -- **Join directly on `DATE_TRUNC`:** +a truncated grain. **Join on `DATE_TRUNC` at the same granularity you group by:** ```sql SELECT DATE_TRUNC('day', o.created_at), MEASURE(o.total_amount), MEASURE(r.total_refund) @@ -389,17 +378,24 @@ JOIN returns_view r ON DATE_TRUNC('day', r.created_at) = DATE_TRUNC('day', o.cre GROUP BY 1 ``` -In both cases the grouped column is emitted as a time dimension with its -granularity. A join written directly on `DATE_TRUNC` is an `INNER` join (the SQL -planner expresses it as a filtered cross join), so both sides must share a key; -both truncated columns must resolve to the same underlying time member at the -same granularity. +The grouped column is emitted as a time dimension with its granularity. A join +written on `DATE_TRUNC` is an `INNER` join (the SQL planner expresses it as a +filtered cross join), so both sides must share a key; both truncated columns +must resolve to the same underlying time member at the same granularity. 
The join-key granularity must match the `GROUP BY` granularity, because the -facts are stitched together at the grain you group by. Joining on -`DATE_TRUNC('month', …)` while grouping by `DATE_TRUNC('day', …)` is not merged -(it would silently stitch at day grain, diverging from the month-grain join) and -falls back to standard handling. +facts are stitched together at the grain you group by. This has two +consequences: + +- Joining on `DATE_TRUNC('month', …)` while grouping by `DATE_TRUNC('day', …)` + is not merged (it would silently stitch at day grain, diverging from the + month-grain join). +- Joining on the **raw** time column (`ON r.created_at = o.created_at`, an + exact-timestamp join) while grouping by `DATE_TRUNC('day', …)` is likewise not + merged — the row-grain join doesn't match the day-grain group-by. Truncate the + join key to the grain you group by instead. + +In both cases the query falls back to standard join handling. You can also combine a `DATE_TRUNC` equality with a plain dimension equality in the same join (a composite key), and group by both: diff --git a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs index 434d562ee578d..ee72cca37f4a0 100644 --- a/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs +++ b/rust/cubesql/cubesql/src/compile/test/test_cube_join_views.rs @@ -866,3 +866,32 @@ async fn test_inner_join_on_date_trunc_and_dimension() { } ) } + +/// A join on the raw time column (exact-timestamp equality, "no grain") does not +/// match a truncated `DATE_TRUNC('day', ...)` GROUP BY, so it is not merged: the +/// multi-fact stitch happens at the GROUP BY grain, which must be the grain the +/// user joined on. Truncate the join key to the grain you group by instead. 
+#[tokio::test] +async fn test_raw_time_join_with_date_trunc_group_by_is_not_merged() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let result = plan_view_join( + r#" + SELECT DATE_TRUNC('day', c.created_at), measure(o.revenue) + FROM customers_view c + LEFT JOIN orders_view o ON o.created_at = c.created_at + GROUP BY 1 + "#, + true, + ) + .await; + + assert!( + result.is_err(), + "expected raw-time-column join with a DATE_TRUNC GROUP BY not to merge, got: {:?}", + result.map(|p| p.as_logical_plan().find_cube_scan().request) + ); +}