cube-js · paveltiunov · Jun 6, 2026 · claude · Jun 6, 2026 · claude
@@ -73,7 +73,7 @@ impl InlineAggregateStream {
             aggregate_expressions(&agg.aggr_expr, &agg.mode, agg_group_by.num_group_exprs())?;
 
         let filter_expressions = match agg.mode {
-            InlineAggregateMode::Partial => agg_filter_expr,
+            InlineAggregateMode::Partial | InlineAggregateMode::Single => agg_filter_expr,
             InlineAggregateMode::Final => {
                 vec![None; agg.aggr_expr.len()]
             }
@@ -113,7 +113,7 @@ fn aggregate_expressions(
     col_idx_base: usize,
 ) -> DFResult<Vec<Vec<Arc<dyn PhysicalExpr>>>> {
     match mode {
-        InlineAggregateMode::Partial => Ok(aggr_expr
+        InlineAggregateMode::Partial | InlineAggregateMode::Single => Ok(aggr_expr
             .iter()
             .map(|agg| {
                 let mut result = agg.expressions();
@@ -225,7 +225,6 @@ impl Stream for InlineAggregateStream {
                 ExecutionState::ProducingOutput(batch) => {
                     let batch = batch.clone();
 
-                    // Determine next state
                     self.exec_state = if self.input_done {
                         ExecutionState::Done
                     } else {
@@ -267,7 +266,7 @@ impl InlineAggregateStream {
                     let state = acc.state(emit_to)?;
                     aggr_arrays.extend(state);
                 }
-                InlineAggregateMode::Final => {
+                InlineAggregateMode::Final | InlineAggregateMode::Single => {
                     // Emit final aggregated values
                     aggr_arrays.push(acc.evaluate(emit_to)?);
                 }
@@ -333,7 +332,7 @@ impl InlineAggregateStream {
             // Call the appropriate method on each aggregator with
             // the entire input row and the relevant group indexes
             match self.mode {
-                InlineAggregateMode::Partial => {
+                InlineAggregateMode::Partial | InlineAggregateMode::Single => {
                     acc.update_batch(values, group_indices, opt_filter, total_num_groups)?;
                 }
                 _ => {

@@ -29,6 +29,7 @@ use std::sync::Arc;
 pub enum InlineAggregateMode {
     Partial,
     Final,
+    Single, // one-stage aggregation: accumulators are fed input via `update_batch` (like Partial) and emit `evaluate` results (like Final)
 }
 
 #[derive(Debug, Clone)]
@@ -66,10 +67,11 @@ impl InlineAggregateExec {
             return None;
         }
 
-        // Only support Partial and Final modes
+        // Only support Partial, Final, and Single modes
         let mode = match aggregate.mode() {
             AggregateMode::Partial => InlineAggregateMode::Partial,
             AggregateMode::Final => InlineAggregateMode::Final,
+            AggregateMode::Single => InlineAggregateMode::Single,
             _ => return None,
         };
 
@@ -111,6 +113,11 @@ impl InlineAggregateExec {
         self.limit
     }
 
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self { // builder-style setter: consumes the exec, replaces its `limit`, returns it
+        self.limit = limit;
+        self
+    }
+
     pub fn aggr_expr(&self) -> &[Arc<AggregateFunctionExpr>] {
         &self.aggr_expr
     }
@@ -151,7 +158,7 @@ impl ExecutionPlan for InlineAggregateExec {
 
     fn required_input_distribution(&self) -> Vec<Distribution> {
         match &self.mode {
-            InlineAggregateMode::Partial => {
+            InlineAggregateMode::Partial | InlineAggregateMode::Single => {
                 vec![Distribution::UnspecifiedDistribution]
             }
             InlineAggregateMode::Final => {
@@ -181,7 +188,7 @@ impl ExecutionPlan for InlineAggregateExec {
             group_by: self.group_by.clone(),
             aggr_expr: self.aggr_expr.clone(),
             filter_expr: self.filter_expr.clone(),
-            limit: self.limit.clone(),
+            limit: self.limit,
             input: children[0].clone(),
             schema: self.schema.clone(),
             input_schema: self.input_schema.clone(),

@@ -84,7 +84,7 @@ pub fn push_aggregate_to_workers(
             let worker_input = p_partial.clone().with_new_children(vec![w.input.clone()])?;
 
             // Worker plan, execute partial aggregate inside the worker.
-            Arc::new(WorkerExec::new(
+            let new_worker = WorkerExec::new(
                 worker_input,
                 w.max_batch_rows,
                 // TODO upgrade DF: WorkerExec limit_and_reverse must be wrong here.  Should be
@@ -98,7 +98,9 @@ pub fn push_aggregate_to_workers(
                 WorkerPlanningParams {
                     worker_partition_count: w.properties().output_partitioning().partition_count(),
                 },
-            ))
+                w.worker_sort_and_limit.clone(),
+            );
+            Arc::new(new_worker)
         } else {
             return Ok(p_final);
         };

@@ -5,16 +5,19 @@
 pub mod rewrite_plan;
 pub mod rolling_optimizer;
 mod trace_data_loaded;

 use super::serialized_plan::PreSerializedPlan;
 use crate::cluster::{Cluster, WorkerPlanningParams};
 use crate::queryplanner::optimizations::distributed_partial_aggregate::{
    add_limit_to_workers, ensure_partition_merge, push_aggregate_to_workers,
     replace_suboptimal_merge_sorts,
 };
 use crate::queryplanner::optimizations::inline_aggregate_rewriter::replace_with_inline_aggregate;
+use crate::queryplanner::check_memory::CheckMemoryExec;
+use crate::queryplanner::inline_aggregate::{InlineAggregateExec, InlineAggregateMode};
 use crate::queryplanner::planning::CubeExtensionPlanner;
 use crate::queryplanner::pretty_printers::{pp_phys_plan_ext, PPOptions};
+use crate::queryplanner::query_executor::ClusterSendExec;
 use crate::queryplanner::rolling::RollingWindowPlanner;
 use crate::queryplanner::trace_data_loaded::DataLoadedSize;
 use crate::util::memory::MemoryHandler;
@@ -25,7 +28,10 @@
 use datafusion::execution::context::QueryPlanner;
 use datafusion::execution::SessionState;
 use datafusion::logical_expr::LogicalPlan;
+use datafusion::physical_expr::LexOrdering;
 use datafusion::physical_optimizer::PhysicalOptimizerRule;
+use datafusion::physical_plan::projection::ProjectionExec;
+use datafusion::physical_plan::sorts::sort::SortExec;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
 use distributed_partial_aggregate::ensure_partition_merge_with_acceptable_parent;
@@ -148,6 +154,10 @@
     // Replace sorted AggregateExec with InlineAggregateExec for better performance
     let p = rewrite_physical_plan(p, &mut |p| replace_with_inline_aggregate(p))?;
 
+    // Apply worker_sort_and_limit AFTER aggregate restructuring, so the SortExec
+    // wraps the aggregate output (not the raw scan input).
+    let p = rewrite_physical_plan(p, &mut |p| apply_worker_sort_and_limit(p))?;
+
     Ok(p)
 }
 
@@ -177,10 +187,190 @@
         "Rewrote physical plan by add_limit_to_workers:\n{}",
         pp_phys_plan_ext(p.as_ref(), &PPOptions::show_nonmeta())
     );
+    let p = push_sort_to_workers(p)?;
+    log::trace!(
+        "Rewrote physical plan by push_sort_to_workers:\n{}",
+        pp_phys_plan_ext(p.as_ref(), &PPOptions::show_nonmeta())
+    );
     let p = rewrite_physical_plan(p, &mut |p| replace_suboptimal_merge_sorts(p))?;
     log::trace!(
         "Rewrote physical plan by replace_suboptimal_merge_sorts:\n{}",
         pp_phys_plan_ext(p.as_ref(), &PPOptions::show_nonmeta())
     );
     Ok(p)
 }
+
+/// Pushes the router's top-level `SortExec(fetch=N)` down to the workers when every sort key is a plain GROUP BY column.
+/// Matches `SortExec(fetch) → [ProjectionExec] → InlineAggregateExec(Final) → [CheckMemoryExec] → ClusterSendExec` whose worker plan root is `InlineAggregateExec(Partial)`; any other shape returns `p` unchanged.
+/// On a match the worker plan is wrapped in `SortExec(fetch=N)` and the router subtree is rebuilt around it; the router's own sort is kept on top.
+/// NOTE(review): relies on worker groups not overlapping and on DataFusion's SortExec with fetch using a bounded heap (worker output limited to N rows, O(N) memory) — neither is verified in this function.
+fn push_sort_to_workers(
+    p: Arc<dyn ExecutionPlan>,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    let Some(sort) = p.as_any().downcast_ref::<SortExec>() else { // only a SortExec at the root of `p` is considered
+        return Ok(p);
+    };
+    let Some(fetch) = sort.fetch() else { // a sort without a fetch is left alone
+        return Ok(p);
+    };
+
+    let sort_exprs = sort.expr().clone();
+
+    // Walk down through optional Projection → InlineAggregate(Final) → optional CheckMemory → ClusterSend
+    let sort_input = sort.input();
+    let (below_proj, col_mapping) = if let Some(proj) = sort_input.as_any().downcast_ref::<ProjectionExec>() {
+        // mapping[i] is the input column index when projection output i is a bare Column, otherwise None
+        let mapping: Vec<Option<usize>> = proj.expr().iter().map(|(expr, _name)| {
+            expr.as_any()
+                .downcast_ref::<datafusion::physical_plan::expressions::Column>()
+                .map(|c| c.index())
+        }).collect();
+        (proj.input().clone(), Some(mapping))
+    } else {
+        (sort_input.clone(), None)
+    };
+
+    let Some(inline_final) = below_proj.as_any().downcast_ref::<InlineAggregateExec>() else {
+        return Ok(p);
+    };
+    if *inline_final.mode() != InlineAggregateMode::Final {
+        return Ok(p);
+    }
+    let group_key_len = inline_final.group_expr().expr().len();
+
+    // Translate sort expressions to pre-projection column indices; None marks a key that is not a plain column or does not resolve to an index below group_key_len
+    let translated_sort_exprs: Vec<_> = sort_exprs.iter().map(|se| {
+        if let Some(col) = se.expr.as_any().downcast_ref::<datafusion::physical_plan::expressions::Column>() {
+            let actual_idx = if let Some(ref mapping) = col_mapping {
+                mapping.get(col.index()).copied().flatten()
+            } else {
+                Some(col.index())
+            };
+            actual_idx.filter(|&idx| idx < group_key_len) // presumably the aggregate output lists group columns first — TODO confirm
+        } else {
+            None
+        }
+    }).collect();
+
+    // All sort columns must be GROUP BY columns; otherwise leave the plan untouched
+    if translated_sort_exprs.iter().any(|x| x.is_none()) {
+        return Ok(p);
+    }
+
+    // Find ClusterSendExec below InlineAggregate(Final), possibly through CheckMemoryExec
+    let final_input = inline_final.input();
+    let (cluster_send, through_check_memory) =
+        if let Some(cs) = final_input.as_any().downcast_ref::<ClusterSendExec>() {
+            (cs, false)
+        } else if let Some(cm) = final_input.as_any().downcast_ref::<CheckMemoryExec>() {
+            if let Some(cs) = cm.input.as_any().downcast_ref::<ClusterSendExec>() {
+                (cs, true)
+            } else {
+                return Ok(p);
+            }
+        } else {
+            return Ok(p);
+        };
+
+    // Don't override if limit_and_reverse is already set
+    if cluster_send.limit_and_reverse.is_some() {
+        return Ok(p);
+    }
+
+    let worker_input = &cluster_send.input_for_optimizations;
+
+    // Require InlineAggregateExec(Partial) as the worker plan root. NOTE(review): this is taken as evidence that groups don't overlap — confirm
+    let has_inline_partial = worker_input
+        .as_any()
+        .downcast_ref::<InlineAggregateExec>()
+        .map_or(false, |ia| *ia.mode() == InlineAggregateMode::Partial);
+    if !has_inline_partial {
+        return Ok(p);
+    }
+
+    // Build sort expressions for the worker: translated (pre-projection) column indices, same sort options as the router sort
+    let worker_schema = worker_input.schema();
+    let worker_sort_exprs: Vec<_> = sort_exprs.iter().zip(translated_sort_exprs.iter()).map(|(se, &mapped_idx)| {
+        let idx = mapped_idx.unwrap(); // cannot be None: the any(is_none) guard above already returned in that case
+        datafusion::physical_expr::PhysicalSortExpr {
+            expr: Arc::new(datafusion::physical_plan::expressions::Column::new(
+                worker_schema.field(idx).name(), // assumes group columns sit at the same positions in the worker's partial output — TODO confirm
+                idx,
+            )),
+            options: se.options,
+        }
+    }).collect();
+
+    // Wrap the worker plan: SortExec(fetch=N) → InlinePartialAggregate
+    let new_worker_input: Arc<dyn ExecutionPlan> = Arc::new(
+        SortExec::new(LexOrdering::new(worker_sort_exprs), worker_input.clone())
+            .with_fetch(Some(fetch)),
+    );
+
+    // Rebuild ClusterSendExec with the new worker input, passing its existing required_input_ordering through
+    let new_cluster_send: Arc<dyn ExecutionPlan> = Arc::new(
+        cluster_send.with_changed_schema(new_worker_input, cluster_send.required_input_ordering.clone()),
+    );
+
+    // Re-wrap with CheckMemoryExec if it was present
+    let new_final_child: Arc<dyn ExecutionPlan> = if through_check_memory {
+        final_input.clone().with_new_children(vec![new_cluster_send])?
+    } else {
+        new_cluster_send
+    };
+
+    // Rebuild InlineAggregate(Final) with new child
+    let new_inline_final: Arc<dyn ExecutionPlan> =
+        Arc::clone(&below_proj).with_new_children(vec![new_final_child])?;
+
+    // Rebuild Projection if present
+    let new_sort_input = if sort_input.as_any().downcast_ref::<ProjectionExec>().is_some() {
+        sort_input.clone().with_new_children(vec![new_inline_final])?
+    } else {
+        new_inline_final
+    };
+
+    // Rebuild SortExec with the new subtree (the router-side sort and fetch stay in place)
+    p.with_new_children(vec![new_sort_input])
+}
+
+/// Wraps the input of a `WorkerExec` in `SortExec(fetch=limit)` when its `worker_sort_and_limit` is set; any other node, or a `WorkerExec` without it, is returned unchanged.
+/// Each sort column is a `(column index, ascending, nulls_first)` triple resolved against the input's schema. Must run AFTER push_aggregate_to_workers and replace_with_inline_aggregate,
+/// so the sort sits on the aggregate output rather than the raw scan. NOTE(review): `schema.field` will presumably panic on an out-of-range index — confirm the indices are validated upstream.
+fn apply_worker_sort_and_limit(
+    p: Arc<dyn ExecutionPlan>,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    use crate::queryplanner::planning::WorkerExec;
+
+    let Some(w) = p.as_any().downcast_ref::<WorkerExec>() else {
+        return Ok(p);
+    };
+    let Some((sort_cols, limit)) = w.worker_sort_and_limit.as_ref() else { // nothing requested for this worker
+        return Ok(p);
+    };
+
+    let input = &w.input;
+    let schema = input.schema();
+    let sort_exprs: Vec<_> = sort_cols
+        .iter()
+        .map(|(col_idx, asc, nulls_first)| {
+            datafusion::physical_expr::PhysicalSortExpr {
+                expr: Arc::new(
+                    datafusion::physical_plan::expressions::Column::new(
+                        schema.field(*col_idx).name(),
+                        *col_idx,
+                    ),
+                ),
+                options: datafusion::arrow::compute::SortOptions {
+                    descending: !asc, // the stored flag is "ascending"; SortOptions wants the inverse
+                    nulls_first: *nulls_first,
+                },
+            }
+        })
+        .collect();
+    let sort_exec: Arc<dyn ExecutionPlan> = Arc::new(
+        SortExec::new(LexOrdering::new(sort_exprs), input.clone())
+            .with_fetch(Some(*limit)),
+    );
+    p.with_new_children(vec![sort_exec]) // NOTE(review): if the rebuilt WorkerExec keeps worker_sort_and_limit, running this pass twice would wrap again — confirm it runs once
+}
@@ -186,5 +186,6 @@ pub fn plan_panic_worker() -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
         WorkerPlanningParams {
             worker_partition_count: 1,
         },
+        /* worker_sort_and_limit */ None,
     )))
 }