2525use crate :: ExecutionPlan ;
2626use datafusion_common:: Result ;
2727use datafusion_common:: Statistics ;
28+ use std:: cell:: RefCell ;
29+ use std:: collections:: HashMap ;
30+ use std:: rc:: Rc ;
2831use std:: sync:: Arc ;
2932
33+ /// Per-call memoization cache for [`compute_statistics`].
34+ ///
35+ /// Keyed by `(plan node pointer address, partition)`. Created once per
36+ /// top-level [`compute_statistics`] call and shared across all recursive
37+ /// and operator-internal calls via [`StatisticsContext`].
38+ ///
39+ /// The pointer-based key is safe within a single synchronous
40+ /// `compute_statistics` call: all `Arc<dyn ExecutionPlan>` nodes are held
41+ /// by the plan tree for the duration of the walk, so addresses cannot be
42+ /// reused.
43+ #[ derive( Debug , Default ) ]
44+ struct StatsCache ( HashMap < ( usize , Option < usize > ) , Arc < Statistics > > ) ;
45+
46+ impl StatsCache {
47+ fn get (
48+ & self ,
49+ plan : & dyn ExecutionPlan ,
50+ partition : Option < usize > ,
51+ ) -> Option < & Arc < Statistics > > {
52+ let key = (
53+ plan as * const dyn ExecutionPlan as * const ( ) as usize ,
54+ partition,
55+ ) ;
56+ self . 0 . get ( & key)
57+ }
58+
59+ fn insert (
60+ & mut self ,
61+ plan : & dyn ExecutionPlan ,
62+ partition : Option < usize > ,
63+ stats : Arc < Statistics > ,
64+ ) {
65+ let key = (
66+ plan as * const dyn ExecutionPlan as * const ( ) as usize ,
67+ partition,
68+ ) ;
69+ self . 0 . insert ( key, stats) ;
70+ }
71+ }
72+
3073/// Context passed to [`ExecutionPlan::partition_statistics_with_context`]
3174/// carrying external information that operators can use when computing
3275/// their statistics.
33- ///
34- /// # Example
35- ///
36- /// ```ignore
37- /// use datafusion_physical_plan::statistics_context::StatisticsContext;
38- ///
39- /// // Leaf node: no children
40- /// let leaf_ctx = StatisticsContext::empty();
41- ///
42- /// // Parent node: receives pre-computed child stats
43- /// let child_stats = vec![child1_stats, child2_stats];
44- /// let parent_ctx = StatisticsContext::new(child_stats);
45- /// ```
46- #[ derive( Debug , Clone ) ]
76+ #[ derive( Debug ) ]
4777pub struct StatisticsContext {
4878 /// Pre-computed statistics for each child of the current node,
4979 /// in the same order as [`ExecutionPlan::children`].
5080 child_stats : Vec < Arc < Statistics > > ,
81+ /// Shared memoization cache for the current `compute_statistics` walk
82+ cache : Option < Rc < RefCell < StatsCache > > > ,
5183}
5284
5385impl StatisticsContext {
5486 /// Creates a new context with pre-computed child statistics.
5587 pub fn new ( child_stats : Vec < Arc < Statistics > > ) -> Self {
56- Self { child_stats }
88+ Self {
89+ child_stats,
90+ cache : None ,
91+ }
5792 }
5893
5994 /// Creates an empty context (for leaf nodes or when child stats
6095 /// are not available).
6196 pub fn empty ( ) -> Self {
6297 Self {
6398 child_stats : Vec :: new ( ) ,
99+ cache : None ,
64100 }
65101 }
66102
67103 /// Returns the pre-computed statistics for each child node.
68104 pub fn child_stats ( & self ) -> & [ Arc < Statistics > ] {
69105 & self . child_stats
70106 }
107+
108+ /// Computes statistics for a child plan, using the shared cache
109+ /// from the current [`compute_statistics`] walk.
110+ ///
111+ /// Use this when [`Self::child_stats`] does not provide the right
112+ /// granularity: partition-preserving operators needing per-partition
113+ /// child stats (via `Some(partition)`), or partition-merging operators
114+ /// needing overall stats (via `None`).
115+ pub fn compute_child_statistics (
116+ & self ,
117+ plan : & dyn ExecutionPlan ,
118+ partition : Option < usize > ,
119+ ) -> Result < Arc < Statistics > > {
120+ match & self . cache {
121+ Some ( cache) => compute_statistics_inner ( plan, partition, cache) ,
122+ None => compute_statistics ( plan, partition) ,
123+ }
124+ }
125+ }
126+
127+ impl Clone for StatisticsContext {
128+ fn clone ( & self ) -> Self {
129+ Self {
130+ child_stats : self . child_stats . clone ( ) ,
131+ cache : self . cache . clone ( ) ,
132+ }
133+ }
71134}
72135
73136impl Default for StatisticsContext {
@@ -78,25 +141,96 @@ impl Default for StatisticsContext {
78141
79142/// Computes statistics for a plan node by first recursively computing
80143/// statistics for all children, then calling
81- /// [`ExecutionPlan::partition_statistics_with_context`] with the pre-computed child
82- /// statistics.
144+ /// [`ExecutionPlan::partition_statistics_with_context`] with the pre-computed
145+ /// child statistics.
146+ ///
147+ /// Results are memoized within a single call: operators that internally
148+ /// call [`StatisticsContext::compute_child_statistics`] (e.g., partition-merging
149+ /// operators requesting overall child stats with `None`) will hit the
150+ /// cache instead of re-walking subtrees.
83151///
84152/// The `partition` parameter is forwarded to children. This is correct
85153/// for partition-preserving operators (filter, projection, sort, etc.),
86154/// but operators that need overall child stats regardless of the
87155/// requested partition (e.g., repartition, coalesce, asymmetric joins)
88- /// must handle this internally by calling `compute_statistics` with
89- /// `None` on the relevant children .
156+ /// must handle this internally by calling
157+ /// [`StatisticsContext::compute_child_statistics`] with `None` .
90158pub fn compute_statistics (
91159 plan : & dyn ExecutionPlan ,
92160 partition : Option < usize > ,
93161) -> Result < Arc < Statistics > > {
162+ let cache = Rc :: new ( RefCell :: new ( StatsCache :: default ( ) ) ) ;
163+ compute_statistics_inner ( plan, partition, & cache)
164+ }
165+
166+ fn compute_statistics_inner (
167+ plan : & dyn ExecutionPlan ,
168+ partition : Option < usize > ,
169+ cache : & Rc < RefCell < StatsCache > > ,
170+ ) -> Result < Arc < Statistics > > {
171+ if let Some ( cached) = cache. borrow ( ) . get ( plan, partition) {
172+ return Ok ( Arc :: clone ( cached) ) ;
173+ }
174+
94175 let child_stats = plan
95176 . children ( )
96177 . iter ( )
97- . map ( |child| compute_statistics ( child. as_ref ( ) , partition) )
178+ . map ( |child| compute_statistics_inner ( child. as_ref ( ) , partition, cache ) )
98179 . collect :: < Result < Vec < _ > > > ( ) ?;
99180
100- let ctx = StatisticsContext :: new ( child_stats) ;
101- plan. partition_statistics_with_context ( partition, & ctx)
181+ let ctx = StatisticsContext {
182+ child_stats,
183+ cache : Some ( Rc :: clone ( cache) ) ,
184+ } ;
185+ let result = plan. partition_statistics_with_context ( partition, & ctx) ?;
186+
187+ cache
188+ . borrow_mut ( )
189+ . insert ( plan, partition, Arc :: clone ( & result) ) ;
190+ Ok ( result)
191+ }
192+
193+ #[ cfg( all( test, feature = "test_utils" ) ) ]
194+ mod tests {
195+ use super :: * ;
196+ use crate :: coalesce_partitions:: CoalescePartitionsExec ;
197+ use crate :: test:: exec:: StatisticsExec ;
198+ use arrow:: datatypes:: { DataType , Field , Schema } ;
199+ use datafusion_common:: { ColumnStatistics , stats:: Precision } ;
200+
201+ fn make_stats_leaf ( num_rows : usize ) -> Arc < dyn ExecutionPlan > {
202+ let schema = Schema :: new ( vec ! [ Field :: new( "a" , DataType :: Int32 , false ) ] ) ;
203+ let col_stats = vec ! [ ColumnStatistics {
204+ null_count: Precision :: Exact ( 0 ) ,
205+ max_value: Precision :: Absent ,
206+ min_value: Precision :: Absent ,
207+ sum_value: Precision :: Absent ,
208+ distinct_count: Precision :: Absent ,
209+ byte_size: Precision :: Absent ,
210+ } ] ;
211+ Arc :: new ( StatisticsExec :: new (
212+ Statistics {
213+ num_rows : Precision :: Exact ( num_rows) ,
214+ total_byte_size : Precision :: Absent ,
215+ column_statistics : col_stats,
216+ } ,
217+ schema,
218+ ) )
219+ }
220+
221+ #[ test]
222+ fn child_stats_always_returns_overall_stats ( ) {
223+ // CoalescePartitionsExec merges partitions, so when called with
224+ // Some(0) the walk should still pre-compute children with None
225+ let leaf = make_stats_leaf ( 100 ) ;
226+ let plan: Arc < dyn ExecutionPlan > = Arc :: new ( CoalescePartitionsExec :: new ( leaf) ) ;
227+
228+ // Calling with Some(0) should still work and return correct stats
229+ let stats = compute_statistics ( plan. as_ref ( ) , Some ( 0 ) ) . unwrap ( ) ;
230+ assert_eq ! ( stats. num_rows, Precision :: Exact ( 100 ) ) ;
231+
232+ // Calling with None should return the same
233+ let stats_none = compute_statistics ( plan. as_ref ( ) , None ) . unwrap ( ) ;
234+ assert_eq ! ( stats_none. num_rows, Precision :: Exact ( 100 ) ) ;
235+ }
102236}
0 commit comments