Skip to content

Commit 189287f

Browse files
committed
test(expression_analyzer): add StatisticsTable and end-to-end SLT for OR selectivity
Add a reusable StatisticsTable (TableProvider + ExecutionPlan with user-supplied statistics) to the sqllogictest harness, and use it in expression_analyzer.slt
1 parent 8aa003b commit 189287f

6 files changed

Lines changed: 288 additions & 5 deletions

File tree

datafusion/physical-expr/src/expression_analyzer/default.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ impl DefaultExpressionAnalyzer {
7070
_ => None,
7171
}
7272
}
73-
7473
}
7574

7675
impl ExpressionAnalyzer for DefaultExpressionAnalyzer {

datafusion/physical-plan/src/filter.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2658,7 +2658,7 @@ mod tests {
26582658
schema.clone(),
26592659
));
26602660
// (a = 42 OR b = 5): OR is not expressible as a single interval
2661-
let predicate = Arc::new(BinaryExpr::new(
2661+
let predicate: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
26622662
Arc::new(BinaryExpr::new(
26632663
Arc::new(Column::new("a", 0)),
26642664
Operator::Eq,
@@ -2673,7 +2673,7 @@ mod tests {
26732673
));
26742674

26752675
// Without ExpressionAnalyzer: default 20% selectivity -> 200 rows
2676-
let filter = Arc::new(FilterExec::try_new(predicate.clone(), input as _)?);
2676+
let filter = Arc::new(FilterExec::try_new(Arc::clone(&predicate), input as _)?);
26772677
let stats = filter.partition_statistics(None)?;
26782678
assert_eq!(stats.num_rows, Precision::Inexact(200));
26792679

datafusion/sqllogictest/src/test_context.rs

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,10 @@ impl TestContext {
177177
info!("Registering dummy async udf");
178178
register_async_abs_udf(test_ctx.session_ctx())
179179
}
180+
"expression_analyzer.slt" => {
181+
info!("Registering tables with controlled statistics");
182+
statistics_table::register_statistics_tables(test_ctx.session_ctx());
183+
}
180184
_ => {
181185
info!("Using default SessionContext");
182186
}
@@ -615,3 +619,165 @@ fn register_async_abs_udf(ctx: &SessionContext) {
615619
let udf = AsyncScalarUDF::new(Arc::new(async_abs));
616620
ctx.register_udf(udf.into_scalar_udf());
617621
}
622+
623+
/// A table provider with fully controlled statistics for testing
624+
/// statistics-dependent optimizer and planner behaviors.
625+
///
626+
/// Unlike [`MemTable`] (which derives statistics from actual data), this
627+
/// provider returns whatever `Statistics` you supply, letting tests exercise
628+
/// code paths that depend on specific column NDV, min/max, or row-count values
629+
/// without needing real data files.
630+
pub mod statistics_table {
631+
use std::sync::Arc;
632+
633+
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
634+
use async_trait::async_trait;
635+
use datafusion::catalog::Session;
636+
use datafusion::common::tree_node::TreeNodeRecursion;
637+
use datafusion::common::{Result, stats::Precision};
638+
use datafusion::datasource::{TableProvider, TableType};
639+
use datafusion::execution::TaskContext;
640+
use datafusion::logical_expr::Expr;
641+
use datafusion::physical_expr::{EquivalenceProperties, PhysicalExpr};
642+
use datafusion::physical_plan::{
643+
ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
644+
PlanProperties, SendableRecordBatchStream, Statistics,
645+
execution_plan::{Boundedness, EmissionType},
646+
};
647+
use datafusion::prelude::SessionContext;
648+
649+
/// A [`TableProvider`] and [`ExecutionPlan`] that returns user-supplied
650+
/// statistics. Useful for testing code paths that depend on specific column
651+
/// NDV, min/max, or row counts without requiring real data files.
652+
#[derive(Debug, Clone)]
653+
pub struct StatisticsTable {
654+
schema: SchemaRef,
655+
stats: Statistics,
656+
cache: Arc<PlanProperties>,
657+
}
658+
659+
impl StatisticsTable {
660+
pub fn new(schema: SchemaRef, stats: Statistics) -> Self {
661+
assert_eq!(
662+
schema.fields().len(),
663+
stats.column_statistics.len(),
664+
"column_statistics length must match schema field count"
665+
);
666+
let cache = Arc::new(PlanProperties::new(
667+
EquivalenceProperties::new(Arc::clone(&schema)),
668+
Partitioning::UnknownPartitioning(1),
669+
EmissionType::Incremental,
670+
Boundedness::Bounded,
671+
));
672+
Self {
673+
schema,
674+
stats,
675+
cache,
676+
}
677+
}
678+
}
679+
680+
impl DisplayAs for StatisticsTable {
681+
fn fmt_as(
682+
&self,
683+
_t: DisplayFormatType,
684+
f: &mut std::fmt::Formatter,
685+
) -> std::fmt::Result {
686+
write!(f, "StatisticsTable")
687+
}
688+
}
689+
690+
impl ExecutionPlan for StatisticsTable {
691+
fn name(&self) -> &'static str {
692+
"StatisticsTable"
693+
}
694+
695+
fn properties(&self) -> &Arc<PlanProperties> {
696+
&self.cache
697+
}
698+
699+
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
700+
vec![]
701+
}
702+
703+
fn apply_expressions(
704+
&self,
705+
_f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
706+
) -> Result<TreeNodeRecursion> {
707+
Ok(TreeNodeRecursion::Continue)
708+
}
709+
710+
fn with_new_children(
711+
self: Arc<Self>,
712+
_children: Vec<Arc<dyn ExecutionPlan>>,
713+
) -> Result<Arc<dyn ExecutionPlan>> {
714+
Ok(self)
715+
}
716+
717+
fn execute(
718+
&self,
719+
_partition: usize,
720+
_context: Arc<TaskContext>,
721+
) -> Result<SendableRecordBatchStream> {
722+
datafusion::common::not_impl_err!(
723+
"StatisticsTable is for statistics testing only"
724+
)
725+
}
726+
727+
fn partition_statistics(
728+
&self,
729+
_partition: Option<usize>,
730+
) -> Result<Arc<Statistics>> {
731+
Ok(Arc::new(self.stats.clone()))
732+
}
733+
}
734+
735+
#[async_trait]
736+
impl TableProvider for StatisticsTable {
737+
fn schema(&self) -> SchemaRef {
738+
Arc::clone(&self.schema)
739+
}
740+
741+
fn table_type(&self) -> TableType {
742+
TableType::Base
743+
}
744+
745+
async fn scan(
746+
&self,
747+
_state: &dyn Session,
748+
projection: Option<&Vec<usize>>,
749+
_filters: &[Expr],
750+
_limit: Option<usize>,
751+
) -> Result<Arc<dyn ExecutionPlan>> {
752+
let schema = datafusion::common::project_schema(&self.schema, projection)?;
753+
let stats = self.stats.clone().project(projection);
754+
Ok(Arc::new(StatisticsTable::new(schema, stats)))
755+
}
756+
}
757+
758+
/// Registers named [`StatisticsTable`] instances needed by SLT tests
759+
/// that require controlled statistics (NDV, row count, min/max).
760+
pub fn register_statistics_tables(ctx: &SessionContext) {
761+
// t_ndv: 1000 rows, column a (Int64, NDV=10), column b (Int64, NDV=5).
762+
let schema = Arc::new(Schema::new(vec![
763+
Field::new("a", DataType::Int64, false),
764+
Field::new("b", DataType::Int64, false),
765+
]));
766+
let stats = Statistics {
767+
num_rows: Precision::Inexact(1000),
768+
total_byte_size: Precision::Absent,
769+
column_statistics: vec![
770+
ColumnStatistics {
771+
distinct_count: Precision::Inexact(10),
772+
..Default::default()
773+
},
774+
ColumnStatistics {
775+
distinct_count: Precision::Inexact(5),
776+
..Default::default()
777+
},
778+
],
779+
};
780+
ctx.register_table("t_ndv", Arc::new(StatisticsTable::new(schema, stats)))
781+
.expect("registering t_ndv should succeed");
782+
}
783+
}
datafusion/sqllogictest/test_files/expression_analyzer.slt

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Tests for ExpressionAnalyzerRegistry end-to-end integration.
19+
#
20+
# t_ndv is a StatisticsTable with controlled statistics:
21+
# 1000 rows, column a (Int64, NDV=10), column b (Int64, NDV=5).
22+
#
23+
# OR predicates are not expressible as a single interval, so the built-in
24+
# interval-arithmetic path always falls back to the default selectivity (20%).
25+
# ExpressionAnalyzerRegistry applies inclusion-exclusion instead:
26+
# P(a=42 OR b=5) = P(a=42) + P(b=5) - P(a=42)*P(b=5)
27+
# = 1/10 + 1/5 - 1/50
28+
# = 0.1 + 0.2 - 0.02 = 0.28
29+
# Expected rows = round(1000 * 0.28) = 280.
30+
31+
statement ok
32+
SET datafusion.execution.target_partitions = 1;
33+
34+
statement ok
35+
SET datafusion.explain.show_statistics = true;
36+
37+
statement ok
38+
SET datafusion.explain.physical_plan_only = true;
39+
40+
# Without ExpressionAnalyzerRegistry: OR predicate falls back to 20% default selectivity
41+
# → FilterExec estimated rows = 1000 * 0.20 = 200
42+
query TT
43+
EXPLAIN SELECT * FROM t_ndv WHERE a = 42 OR b = 5;
44+
----
45+
physical_plan
46+
01)FilterExec: a@0 = 42 OR b@1 = 5, statistics=[Rows=Inexact(200), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
47+
02)--CooperativeExec, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
48+
03)----StatisticsTable, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
49+
50+
# Enable ExpressionAnalyzerRegistry so inclusion-exclusion applies to OR predicates
51+
statement ok
52+
SET datafusion.optimizer.use_expression_analyzer = true;
53+
54+
# With ExpressionAnalyzerRegistry: OR uses inclusion-exclusion
55+
# P(a=42 OR b=5) = 1/10 + 1/5 - (1/10 * 1/5) = 0.28 → 280 rows
56+
query TT
57+
EXPLAIN SELECT * FROM t_ndv WHERE a = 42 OR b = 5;
58+
----
59+
physical_plan
60+
01)FilterExec: a@0 = 42 OR b@1 = 5, statistics=[Rows=Inexact(280), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
61+
02)--CooperativeExec, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
62+
03)----StatisticsTable, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
63+
64+
# Verify the registry survives physical optimizer rules: ORDER BY + LIMIT triggers the
65+
# TopK sort rule which rewrites the plan above FilterExec. The FilterExec row estimate
66+
# must still reflect inclusion-exclusion (280), not the 20% default.
67+
query TT
68+
EXPLAIN SELECT * FROM t_ndv WHERE a = 42 OR b = 5 ORDER BY a LIMIT 100;
69+
----
70+
physical_plan
71+
01)SortExec: TopK(fetch=100), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false], statistics=[Rows=Inexact(100), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
72+
02)--FilterExec: a@0 = 42 OR b@1 = 5, statistics=[Rows=Inexact(280), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
73+
03)----CooperativeExec, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
74+
04)------StatisticsTable, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
75+
76+
# Verify the registry reaches FilterExec nodes created by optimizer rules: the
77+
# filter_pushdown rule running on UnionExec creates fresh FilterExec nodes (one per
78+
# branch) that never existed when the registry was first injected. Both must show
79+
# 280 rows, confirming re-injection after each rule reaches newly created nodes.
80+
# The UnionExec row count (560 = 2 * 280) and doubled NDVs (20, 10) also confirm
81+
# that distinct-count propagation through UnionExec is correct.
82+
query TT
83+
EXPLAIN SELECT * FROM (SELECT * FROM t_ndv UNION ALL SELECT * FROM t_ndv) WHERE a = 42 OR b = 5;
84+
----
85+
physical_plan
86+
01)UnionExec, statistics=[Rows=Inexact(560), Bytes=Absent, [(Col[0]: Distinct=Inexact(20)),(Col[1]: Distinct=Inexact(10))]]
87+
02)--FilterExec: a@0 = 42 OR b@1 = 5, statistics=[Rows=Inexact(280), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
88+
03)----CooperativeExec, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
89+
04)------StatisticsTable, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
90+
05)--FilterExec: a@0 = 42 OR b@1 = 5, statistics=[Rows=Inexact(280), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
91+
06)----CooperativeExec, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
92+
07)------StatisticsTable, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
93+
94+
# Verify the registry reaches a FilterExec pushed through a join: filter_pushdown
95+
# moves the WHERE clause filter to the left side of the HashJoinExec, creating a
96+
# FilterExec that was not present in the plan at initial injection time.
97+
query TT
98+
EXPLAIN SELECT l.a, r.b FROM t_ndv l JOIN t_ndv r ON l.a = r.a WHERE l.a = 42 OR l.b = 5;
99+
----
100+
physical_plan
101+
01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)], projection=[a@0, b@2], statistics=[Rows=Inexact(28000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
102+
02)--FilterExec: a@0 = 42 OR b@1 = 5, projection=[a@0], statistics=[Rows=Inexact(280), Bytes=Absent, [(Col[0]: Distinct=Inexact(10))]]
103+
03)----CooperativeExec, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
104+
04)------StatisticsTable, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
105+
05)--CooperativeExec, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
106+
06)----StatisticsTable, statistics=[Rows=Inexact(1000), Bytes=Absent, [(Col[0]: Distinct=Inexact(10)),(Col[1]: Distinct=Inexact(5))]]
107+
108+
statement ok
109+
SET datafusion.optimizer.use_expression_analyzer = false;
110+
111+
statement ok
112+
SET datafusion.explain.show_statistics = false;
113+
114+
statement ok
115+
SET datafusion.explain.physical_plan_only = false;
116+
117+
statement ok
118+
SET datafusion.execution.target_partitions = 4;

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,7 @@ datafusion.optimizer.repartition_windows true Should DataFusion repartition data
478478
datafusion.optimizer.skip_failed_rules false When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail
479479
datafusion.optimizer.subset_repartition_threshold 4 Partition count threshold for subset satisfaction optimization. When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): ```text Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a]) If current partitions (3) < threshold (4), repartition: AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)] RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3 AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3) If current partitions (8) >= threshold (4), use subset satisfaction: AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ```
480480
datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys
481-
datafusion.optimizer.use_expression_analyzer false When set to true, the pluggable `ExpressionAnalyzerRegistry` from `SessionState` is injected into exec nodes that use expression-level statistics (`FilterExec`, `ProjectionExec`, `AggregateExec`, join nodes) and re-injected after each physical optimizer rule so rebuilt nodes always carry it. Custom analyzers then influence `partition_statistics` in those operators.
481+
datafusion.optimizer.use_expression_analyzer false When set to true, the pluggable `ExpressionAnalyzerRegistry` from `SessionState` is used for expression-level statistics estimation (NDV, selectivity, min/max, null fraction) in physical plan operators.
482482
datafusion.optimizer.use_statistics_registry false When set to true, the physical plan optimizer uses the pluggable `StatisticsRegistry` for a bottom-up statistics walk across operators, enabling more accurate cardinality estimates. Enabling `use_expression_analyzer` alongside this flag gives built-in providers access to custom expression-level analyzers (NDV, selectivity) for the operators they process.
483483
datafusion.runtime.list_files_cache_limit 1M Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.
484484
datafusion.runtime.list_files_cache_ttl NULL TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes.

0 commit comments

Comments
 (0)