@@ -212,6 +212,51 @@ impl RowGroupAccessPlanFilter {
212212 }
213213 }
214214
215+ /// Prune row groups that can be entirely skipped due to offset.
216+ ///
217+ /// When an offset is specified, rows at the beginning of the scan must be
218+ /// skipped. This method marks leading fully-matched row groups whose
219+ /// cumulative row count falls within the offset as skipped, so they are
220+ /// never read from disk.
221+ ///
222+ /// Returns the remaining offset (number of rows still to skip within the
223+ /// first non-pruned row group).
224+ pub fn prune_by_offset (
225+ & mut self ,
226+ offset : usize ,
227+ has_predicate : bool ,
228+ rg_metadata : & [ RowGroupMetaData ] ,
229+ metrics : & ParquetFileMetrics ,
230+ ) -> usize {
231+ let mut remaining = offset;
232+ let mut pruned_count = 0 ;
233+
234+ for & idx in self . access_plan . row_group_indexes ( ) . iter ( ) {
235+ if remaining == 0 {
236+ break ;
237+ }
238+ // We can skip a row group entirely if:
239+ // - No predicate: all rows match, row count is exact
240+ // - Has predicate but is_fully_matched: all rows pass filter
241+ let can_skip = !has_predicate || self . is_fully_matched [ idx] ;
242+ if can_skip {
243+ let rg_rows = rg_metadata[ idx] . num_rows ( ) as usize ;
244+ if remaining >= rg_rows {
245+ self . access_plan . skip ( idx) ;
246+ remaining -= rg_rows;
247+ pruned_count += 1 ;
248+ } else {
249+ break ;
250+ }
251+ } else {
252+ break ;
253+ }
254+ }
255+
256+ metrics. offset_pruned_row_groups . add_pruned ( pruned_count) ;
257+ remaining
258+ }
259+
215260 /// Prune remaining row groups to only those within the specified range.
216261 ///
217262 /// Updates this set to mark row groups that should not be scanned
@@ -1438,6 +1483,133 @@ mod tests {
14381483 ParquetFileMetrics :: new ( 0 , "file.parquet" , & metrics)
14391484 }
14401485
1486+ /// Create a RowGroupMetaData with the specified number of rows.
1487+ /// Uses a minimal schema with a single INT32 column.
1488+ fn make_row_group_meta ( num_rows : i64 ) -> RowGroupMetaData {
1489+ let schema_descr = get_test_schema_descr ( vec ! [ PrimitiveTypeField :: new(
1490+ "id" ,
1491+ PhysicalType :: INT32 ,
1492+ ) ] ) ;
1493+ let column = ColumnChunkMetaData :: builder ( schema_descr. column ( 0 ) )
1494+ . set_num_values ( num_rows)
1495+ . build ( )
1496+ . unwrap ( ) ;
1497+ RowGroupMetaData :: builder ( schema_descr)
1498+ . set_num_rows ( num_rows)
1499+ . set_total_byte_size ( 1000 )
1500+ . set_column_metadata ( vec ! [ column] )
1501+ . build ( )
1502+ . unwrap ( )
1503+ }
1504+
1505+ /// Helper to build a RowGroupAccessPlanFilter with specified fully_matched flags.
1506+ fn make_filter_with_fully_matched (
1507+ num_rgs : usize ,
1508+ fully_matched : Vec < bool > ,
1509+ ) -> RowGroupAccessPlanFilter {
1510+ assert_eq ! ( num_rgs, fully_matched. len( ) ) ;
1511+ let access_plan = ParquetAccessPlan :: new_all ( num_rgs) ;
1512+ let mut filter = RowGroupAccessPlanFilter :: new ( access_plan) ;
1513+ filter. is_fully_matched = fully_matched;
1514+ filter
1515+ }
1516+
#[test]
fn test_prune_by_offset_skips_fully_matched_rgs() {
    // 3 RGs each with 100 rows, all fully_matched. offset=250.
    // Should skip 2 RGs (200 rows), remaining=50.
    //
    // `has_predicate` is true on purpose: without a predicate the
    // `is_fully_matched` flags are never consulted, so the test would not
    // exercise the fully-matched path it is named after.
    let rg_metadata: Vec<RowGroupMetaData> =
        (0..3).map(|_| make_row_group_meta(100)).collect();
    let metrics = parquet_file_metrics();
    let mut filter = make_filter_with_fully_matched(3, vec![true, true, true]);

    let remaining = filter.prune_by_offset(250, true, &rg_metadata, &metrics);
    assert_eq!(remaining, 50);
    // First two RGs should be skipped, third should still be scanned
    let indexes: Vec<usize> = filter.row_group_indexes().collect();
    assert_eq!(indexes, vec![2]);
}
1532+
#[test]
fn test_prune_by_offset_stops_at_non_fully_matched() {
    // 3 RGs each with 100 rows. First two fully_matched, third not.
    // offset=350 -> skip 2 RGs (200 rows), remaining=150.
    //
    // The remaining offset (150) is larger than the third RG (100 rows),
    // so the only reason pruning stops there is that the RG is not fully
    // matched. With a smaller offset such as 250 the loop would stop at
    // the third RG anyway (50 < 100) and the test could not tell the two
    // cases apart.
    let rg_metadata: Vec<RowGroupMetaData> =
        (0..3).map(|_| make_row_group_meta(100)).collect();
    let metrics = parquet_file_metrics();
    let mut filter = make_filter_with_fully_matched(3, vec![true, true, false]);

    let remaining = filter.prune_by_offset(350, true, &rg_metadata, &metrics);
    assert_eq!(remaining, 150);
    // First two RGs skipped, third still scanned (not fully matched)
    let indexes: Vec<usize> = filter.row_group_indexes().collect();
    assert_eq!(indexes, vec![2]);
}
1550+
#[test]
fn test_prune_by_offset_zero() {
    // With a zero offset nothing has to be skipped: no row group is
    // pruned and the returned remainder is 0.
    let file_metrics = parquet_file_metrics();
    let row_groups: Vec<RowGroupMetaData> =
        std::iter::repeat_with(|| make_row_group_meta(100))
            .take(3)
            .collect();
    let mut plan_filter = make_filter_with_fully_matched(3, vec![true; 3]);

    let leftover = plan_filter.prune_by_offset(0, false, &row_groups, &file_metrics);
    assert_eq!(leftover, 0);

    // Every row group is still part of the scan.
    let scanned = plan_filter.row_group_indexes().collect::<Vec<usize>>();
    assert_eq!(scanned, vec![0, 1, 2]);
}
1565+
#[test]
fn test_prune_by_offset_exact_boundary() {
    // Three row groups of 100 rows and offset=200: the offset ends exactly
    // on a row group boundary, so two row groups are pruned and nothing
    // is left to skip.
    let file_metrics = parquet_file_metrics();
    let row_groups: Vec<RowGroupMetaData> =
        std::iter::repeat_with(|| make_row_group_meta(100))
            .take(3)
            .collect();
    let mut plan_filter = make_filter_with_fully_matched(3, vec![true; 3]);

    let leftover = plan_filter.prune_by_offset(200, false, &row_groups, &file_metrics);
    assert_eq!(leftover, 0);

    // Only the last row group remains in the scan.
    let scanned = plan_filter.row_group_indexes().collect::<Vec<usize>>();
    assert_eq!(scanned, vec![2]);
}
1580+
#[test]
fn test_prune_by_offset_exceeds_total() {
    // offset=400 is larger than the 300 rows available: every row group
    // is pruned and 400 - 300 = 100 rows of offset are left over.
    let file_metrics = parquet_file_metrics();
    let row_groups: Vec<RowGroupMetaData> =
        std::iter::repeat_with(|| make_row_group_meta(100))
            .take(3)
            .collect();
    let mut plan_filter = make_filter_with_fully_matched(3, vec![true; 3]);

    let leftover = plan_filter.prune_by_offset(400, false, &row_groups, &file_metrics);
    assert_eq!(leftover, 100);

    // Nothing is left to scan.
    let indexes = plan_filter.row_group_indexes().collect::<Vec<usize>>();
    assert!(indexes.is_empty());
}
1596+
#[test]
fn test_prune_by_offset_partial_rg() {
    // offset=50 ends inside the first row group (100 rows), so no row
    // group can be pruned and the whole offset is returned.
    let file_metrics = parquet_file_metrics();
    let row_groups: Vec<RowGroupMetaData> =
        std::iter::repeat_with(|| make_row_group_meta(100))
            .take(3)
            .collect();
    let mut plan_filter = make_filter_with_fully_matched(3, vec![true; 3]);

    let leftover = plan_filter.prune_by_offset(50, false, &row_groups, &file_metrics);
    assert_eq!(leftover, 50);

    // All three row groups are still scanned.
    let scanned = plan_filter.row_group_indexes().collect::<Vec<usize>>();
    assert_eq!(scanned, vec![0, 1, 2]);
}
1612+
14411613 #[ tokio:: test]
14421614 async fn test_row_group_bloom_filter_pruning_predicate_simple_expr ( ) {
14431615 BloomFilterTest :: new_data_index_bloom_encoding_stats ( )
0 commit comments