From df00a76fa7d86e6d8a539df4c3835fe52d50200a Mon Sep 17 00:00:00 2001 From: cl Date: Fri, 17 Apr 2026 22:55:19 +0800 Subject: [PATCH 1/4] arrow-select: fuse inline BinaryView filter coalescing Avoid materialising an intermediate filtered RecordBatch when coalescing sparse inline BinaryView batches, and reuse shared filter indices for primitive columns in mixed batches. This addresses issue #9143 and adds focused tests and benchmarks for single-column and mixed BinaryView filter paths. Signed-off-by: cl --- arrow-select/src/coalesce.rs | 233 +++++++++++++++++++++++-- arrow-select/src/coalesce/byte_view.rs | 99 ++++++++++- arrow-select/src/coalesce/primitive.rs | 98 ++++++++++- arrow-select/src/filter.rs | 21 ++- arrow/benches/coalesce_kernels.rs | 85 +++++++++ 5 files changed, 519 insertions(+), 17 deletions(-) diff --git a/arrow-select/src/coalesce.rs b/arrow-select/src/coalesce.rs index 8fe88fb8c377..01cb2fa19579 100644 --- a/arrow-select/src/coalesce.rs +++ b/arrow-select/src/coalesce.rs @@ -20,8 +20,12 @@ //! //! [`filter`]: crate::filter::filter //! [`take`]: crate::take::take -use crate::filter::filter_record_batch; +use crate::filter::{ + FilterBuilder, FilterPredicate, IndexIterator, IterationStrategy, SlicesIterator, + filter_record_batch, +}; use crate::take::take_record_batch; +use arrow_array::cast::AsArray; use arrow_array::types::{BinaryViewType, StringViewType}; use arrow_array::{Array, ArrayRef, BooleanArray, RecordBatch, downcast_primitive}; use arrow_schema::{ArrowError, DataType, SchemaRef}; @@ -238,6 +242,83 @@ impl BatchCoalescer { batch: RecordBatch, filter: &BooleanArray, ) -> Result<(), ArrowError> { + if supports_fused_inline_binary_view_filter(&batch) { + if filter.len() > batch.num_rows() { + return Err(ArrowError::InvalidArgumentError(format!( + "Filter predicate of length {} is larger than target array of length {}", + filter.len(), + batch.num_rows() + ))); + } + + let mut filter_builder = FilterBuilder::new(filter); + if batch.num_columns() > 1 { + filter_builder = filter_builder.optimize(); + } + let predicate = filter_builder.build(); + let selected_count = predicate.count(); + + if selected_count == 0 { + return Ok(()); + } + + if selected_count == batch.num_rows() && filter.len() == batch.num_rows() { + return self.push_batch(batch); + } + + if let Some(limit) = self.biggest_coalesce_batch_size { + if selected_count > limit { + let filtered_batch = predicate.filter_record_batch(&batch)?; + return self.push_batch(filtered_batch); + } + } + + // For dense inline filters, the existing filter kernel remains faster. + if selected_count.saturating_mul(4) > filter.len() { + let filtered_batch = predicate.filter_record_batch(&batch)?; + return self.push_batch(filtered_batch); + } + + let space_in_batch = self.target_batch_size - self.buffered_rows; + if selected_count <= space_in_batch { + let (_schema, arrays, _num_rows) = batch.into_parts(); + + if arrays.len() != self.in_progress_arrays.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Batch has {} columns but BatchCoalescer expects {}", + arrays.len(), + self.in_progress_arrays.len() + ))); + } + + self.in_progress_arrays + .iter_mut() + .zip(arrays) + .for_each(|(in_progress, array)| { + in_progress.set_source(Some(array)); + }); + + let result = (|| { + for in_progress in self.in_progress_arrays.iter_mut() { + in_progress.copy_rows_by_filter(&predicate)?; + } + + self.buffered_rows += selected_count; + if self.buffered_rows >= self.target_batch_size { + self.finish_buffered_batch()?; + } + + Ok(()) + })(); + + for in_progress in self.in_progress_arrays.iter_mut() { + in_progress.set_source(None); + } + + return result; + } + } + // TODO: optimize this to avoid materializing (copying the results // of filter to a new batch) let filtered_batch = filter_record_batch(&batch, filter)?; @@ -588,6 +669,31 @@ fn create_in_progress_array(data_type: &DataType, batch_size: usize) -> Box bool { + let mut has_inline_binary_view = false; + + let supported = batch + .schema() + .fields() + .iter() + .zip(batch.columns()) + .all(|(field, array)| { + if field.data_type().is_primitive() { + return true; + } + + let Some(binary_view) = array.as_binary_view_opt() else { + return false; + }; + + let inline = binary_view.data_buffers().is_empty(); + has_inline_binary_view |= inline; + inline + }); + + supported && has_inline_binary_view +} + /// Incrementally builds up arrays /// /// [`GenericInProgressArray`] is the default implementation that buffers @@ -611,6 +717,39 @@ trait InProgressArray: std::fmt::Debug + Send + Sync { /// Return an error if the source array is not set fn copy_rows(&mut self, offset: usize, len: usize) -> Result<(), ArrowError>; + /// Copy rows selected by `filter` from the current source array. + fn copy_rows_by_filter(&mut self, filter: &FilterPredicate) -> Result<(), ArrowError> { + match filter.strategy() { + IterationStrategy::None => Ok(()), + IterationStrategy::All => self.copy_rows(0, filter.count()), + IterationStrategy::Slices(slices) => { + for &(start, end) in slices { + self.copy_rows(start, end - start)?; + } + Ok(()) + } + IterationStrategy::SlicesIterator => { + for (start, end) in SlicesIterator::new(filter.filter_array()) { + self.copy_rows(start, end - start)?; + } + Ok(()) + } + IterationStrategy::Indices(indices) => self.copy_rows_by_indices(indices), + IterationStrategy::IndexIterator => { + let indices = IndexIterator::new(filter.filter_array(), filter.count()).collect(); + self.copy_rows_by_indices(&indices) + } + } + } + + /// Copy rows at the specified indices from the current source array. + fn copy_rows_by_indices(&mut self, indices: &[usize]) -> Result<(), ArrowError> { + for &idx in indices { + self.copy_rows(idx, 1)?; + } + Ok(()) + } + /// Finish the currently in-progress array and return it as an `ArrayRef` fn finish(&mut self) -> Result; } @@ -1197,6 +1336,78 @@ mod tests { .run(); } + #[test] + fn test_binary_view_filtered() { + let values: Vec> = vec![ + Some(b"foo"), + None, + Some(b"A longer string that is more than 12 bytes"), + ]; + + let binary_view = + BinaryViewArray::from_iter(std::iter::repeat(values.iter()).flatten().take(1000)); + let batch = + RecordBatch::try_from_iter(vec![("c0", Arc::new(binary_view) as ArrayRef)]).unwrap(); + let filter = BooleanArray::from_iter((0..1000).map(|idx| Some(idx % 2 == 0))); + + Test::new("coalesce_binary_view_filtered") + .with_batch(batch.clone()) + .with_filter(filter.clone()) + .with_batch(batch) + .with_filter(filter) + .with_batch_size(256) + .with_expected_output_sizes(vec![256, 256, 256, 232]) + .run(); + } + + #[test] + fn test_binary_view_filtered_inline() { + let values: Vec> = vec![Some(b"foo"), None, Some(b"barbaz")]; + + let binary_view = + BinaryViewArray::from_iter(std::iter::repeat(values.iter()).flatten().take(1000)); + let batch = + RecordBatch::try_from_iter(vec![("c0", Arc::new(binary_view) as ArrayRef)]).unwrap(); + let filter = BooleanArray::from_iter((0..1000).map(|idx| Some(idx % 3 != 1))); + + Test::new("coalesce_binary_view_filtered_inline") + .with_batch(batch.clone()) + .with_filter(filter.clone()) + .with_batch(batch) + .with_filter(filter) + .with_batch_size(300) + .with_expected_output_sizes(vec![300, 300, 300, 300, 134]) + .run(); + } + + #[test] + fn test_mixed_inline_binary_view_filtered() { + let int_values = Int32Array::from_iter((0..1000).map(Some)); + let float_values = arrow_array::Float64Array::from_iter((0..1000).map(|v| Some(v as f64))); + let binary_values: Vec> = vec![Some(b"foo"), None, Some(b"barbaz")]; + let binary_view = BinaryViewArray::from_iter( + std::iter::repeat(binary_values.iter()).flatten().take(1000), + ); + + let batch = RecordBatch::try_from_iter(vec![ + ("i", Arc::new(int_values) as ArrayRef), + ("f", Arc::new(float_values) as ArrayRef), + ("b", Arc::new(binary_view) as ArrayRef), + ]) + .unwrap(); + + let filter = BooleanArray::from_iter((0..1000).map(|idx| Some(idx % 3 != 1))); + + Test::new("coalesce_mixed_inline_binary_view_filtered") + .with_batch(batch.clone()) + .with_filter(filter.clone()) + .with_batch(batch) + .with_filter(filter) + .with_batch_size(300) + .with_expected_output_sizes(vec![300, 300, 300, 300, 134]) + .run(); + } + #[derive(Debug, Clone, PartialEq)] struct ExpectedLayout { len: usize, @@ -1701,18 +1912,20 @@ mod tests { let (schema, mut columns, row_count) = batch.into_parts(); for column in columns.iter_mut() { - let Some(string_view) = column.as_string_view_opt() else { + if let Some(string_view) = column.as_string_view_opt() { + // Re-create the StringViewArray to ensure memory layout is + // consistent + let mut builder = StringViewBuilder::new(); + for s in string_view.iter() { + builder.append_option(s); + } + *column = Arc::new(builder.finish()); continue; - }; + } - // Re-create the StringViewArray to ensure memory layout is - // consistent - let mut builder = StringViewBuilder::new(); - for s in string_view.iter() { - builder.append_option(s); + if let Some(binary_view) = column.as_binary_view_opt() { + *column = Arc::new(BinaryViewArray::from_iter(binary_view.iter())); } - // Update the column with the new StringViewArray - *column = Arc::new(builder.finish()); } let options = RecordBatchOptions::new().with_row_count(Some(row_count)); diff --git a/arrow-select/src/coalesce/byte_view.rs b/arrow-select/src/coalesce/byte_view.rs index 6062cd5e77aa..ce39c4a3ebf4 100644 --- a/arrow-select/src/coalesce/byte_view.rs +++ b/arrow-select/src/coalesce/byte_view.rs @@ -16,10 +16,13 @@ // under the License. use crate::coalesce::InProgressArray; +use crate::filter::{ + FilterPredicate, IndexIterator, IterationStrategy, SlicesIterator, filter_null_mask, +}; use arrow_array::cast::AsArray; use arrow_array::types::ByteViewType; use arrow_array::{Array, ArrayRef, GenericByteViewArray}; -use arrow_buffer::{Buffer, NullBufferBuilder}; +use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, NullBufferBuilder}; use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; use arrow_schema::ArrowError; use std::marker::PhantomData; @@ -111,6 +114,75 @@ impl InProgressByteViewArray { self.completed.push(next_buffer.into()); } + fn append_inline_views_by_filter(&mut self, views: &[u128], filter: &FilterPredicate) { + let current_len = self.views.len(); + self.views.reserve(filter.count()); + + let mut written = 0; + + unsafe { + let mut out = self.views.spare_capacity_mut().as_mut_ptr().cast::(); + + match filter.strategy() { + IterationStrategy::None => {} + IterationStrategy::All => { + std::ptr::copy_nonoverlapping(views.as_ptr(), out, filter.count()); + written = filter.count(); + } + IterationStrategy::Slices(slices) => { + for &(start, end) in slices { + let len = end - start; + std::ptr::copy_nonoverlapping(views.as_ptr().add(start), out, len); + out = out.add(len); + written += len; + } + } + IterationStrategy::SlicesIterator => { + for (start, end) in SlicesIterator::new(filter.filter_array()) { + let len = end - start; + std::ptr::copy_nonoverlapping(views.as_ptr().add(start), out, len); + out = out.add(len); + written += len; + } + } + IterationStrategy::Indices(indices) => { + for &idx in indices { + out.write(*views.get_unchecked(idx)); + out = out.add(1); + written += 1; + } + } + IterationStrategy::IndexIterator => { + for idx in IndexIterator::new(filter.filter_array(), filter.count()) { + out.write(*views.get_unchecked(idx)); + out = out.add(1); + written += 1; + } + } + } + + self.views.set_len(current_len + written); + } + + debug_assert_eq!(written, filter.count()); + } + + fn append_nulls_by_filter( + &mut self, + filter: &FilterPredicate, + source_nulls: Option<&NullBuffer>, + ) { + let Some((null_count, nulls)) = filter_null_mask(source_nulls, filter) else { + self.nulls.append_n_non_nulls(filter.count()); + return; + }; + + let nulls = unsafe { + NullBuffer::new_unchecked(BooleanBuffer::new(nulls, 0, filter.count()), null_count) + }; + self.nulls.append_buffer(&nulls); + } + /// Append views to self.views, updating the buffer index if necessary #[inline(never)] fn append_views_and_update_buffer_index(&mut self, views: &[u128], buffers: &[Buffer]) { @@ -346,6 +418,31 @@ impl InProgressArray for InProgressByteViewArray { Ok(()) } + fn copy_rows_by_filter(&mut self, filter: &FilterPredicate) -> Result<(), ArrowError> { + self.ensure_capacity(); + let source = self.source.take().ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Internal Error: InProgressByteViewArray: source not set".to_string(), + ) + })?; + + let s = source.array.as_byte_view::(); + + if !s.data_buffers().is_empty() { + let filtered = filter.filter(source.array.as_ref())?; + self.set_source(Some(filtered)); + let result = self.copy_rows(0, filter.count()); + self.source = Some(source); + return result; + } + + self.append_nulls_by_filter(filter, s.nulls()); + self.append_inline_views_by_filter(s.views(), filter); + + self.source = Some(source); + Ok(()) + } + fn finish(&mut self) -> Result { self.finish_current(); assert!(self.current.is_none()); diff --git a/arrow-select/src/coalesce/primitive.rs b/arrow-select/src/coalesce/primitive.rs index a7f2fb32ce49..6bb93a21e2b3 100644 --- a/arrow-select/src/coalesce/primitive.rs +++ b/arrow-select/src/coalesce/primitive.rs @@ -16,9 +16,12 @@ // under the License. use crate::coalesce::InProgressArray; +use crate::filter::{ + FilterPredicate, IndexIterator, IterationStrategy, SlicesIterator, filter_null_mask, +}; use arrow_array::cast::AsArray; use arrow_array::{Array, ArrayRef, ArrowPrimitiveType, PrimitiveArray}; -use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; +use arrow_buffer::{BooleanBuffer, NullBuffer, NullBufferBuilder, ScalarBuffer}; use arrow_schema::{ArrowError, DataType}; use std::fmt::Debug; use std::sync::Arc; @@ -94,6 +97,99 @@ impl InProgressArray for InProgressPrimitiveArray Ok(()) } + fn copy_rows_by_filter(&mut self, filter: &FilterPredicate) -> Result<(), ArrowError> { + self.ensure_capacity(); + + let s = self + .source + .as_ref() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Internal Error: InProgressPrimitiveArray: source not set".to_string(), + ) + })? + .as_primitive::(); + + match filter.strategy() { + IterationStrategy::None => return Ok(()), + IterationStrategy::All => return self.copy_rows(0, filter.count()), + IterationStrategy::Slices(slices) => { + for &(start, end) in slices { + self.copy_rows(start, end - start)?; + } + return Ok(()); + } + IterationStrategy::SlicesIterator => { + for (start, end) in SlicesIterator::new(filter.filter_array()) { + self.copy_rows(start, end - start)?; + } + return Ok(()); + } + IterationStrategy::Indices(_) | IterationStrategy::IndexIterator => {} + } + + if let Some((null_count, nulls)) = filter_null_mask(s.nulls(), filter) { + let nulls = unsafe { + NullBuffer::new_unchecked(BooleanBuffer::new(nulls, 0, filter.count()), null_count) + }; + self.nulls.append_buffer(&nulls); + } else { + self.nulls.append_n_non_nulls(filter.count()); + } + + let values = s.values(); + self.current.reserve(filter.count()); + + match filter.strategy() { + IterationStrategy::Indices(indices) => { + for &idx in indices { + self.current.push(values[idx]); + } + } + IterationStrategy::IndexIterator => { + for idx in IndexIterator::new(filter.filter_array(), filter.count()) { + self.current.push(values[idx]); + } + } + IterationStrategy::None + | IterationStrategy::All + | IterationStrategy::Slices(_) + | IterationStrategy::SlicesIterator => unreachable!(), + } + + Ok(()) + } + + fn copy_rows_by_indices(&mut self, indices: &[usize]) -> Result<(), ArrowError> { + self.ensure_capacity(); + + let s = self + .source + .as_ref() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Internal Error: InProgressPrimitiveArray: source not set".to_string(), + ) + })? + .as_primitive::(); + + if let Some(nulls) = s.nulls().filter(|nulls| nulls.null_count() > 0) { + for &idx in indices { + self.nulls.append(nulls.is_valid(idx)); + } + } else { + self.nulls.append_n_non_nulls(indices.len()); + } + + let values = s.values(); + self.current.reserve(indices.len()); + for &idx in indices { + self.current.push(values[idx]); + } + + Ok(()) + } + fn finish(&mut self) -> Result { // take and reset the current values and nulls let values = std::mem::take(&mut self.current); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index e95d01f2b592..3f4bb24b9bb1 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -80,13 +80,13 @@ impl Iterator for SlicesIterator<'_> { /// /// This provides the best performance on most predicates, apart from those which keep /// large runs and therefore favour [`SlicesIterator`] -struct IndexIterator<'a> { +pub(crate) struct IndexIterator<'a> { remaining: usize, iter: BitIndexIterator<'a>, } impl<'a> IndexIterator<'a> { - fn new(filter: &'a BooleanArray, remaining: usize) -> Self { + pub(crate) fn new(filter: &'a BooleanArray, remaining: usize) -> Self { assert_eq!(filter.null_count(), 0); let iter = filter.values().set_indices(); Self { remaining, iter } @@ -326,7 +326,7 @@ impl FilterBuilder { /// The iteration strategy used to evaluate [`FilterPredicate`] #[derive(Debug)] -enum IterationStrategy { +pub(crate) enum IterationStrategy { /// A lazily evaluated iterator of ranges SlicesIterator, /// A lazily evaluated iterator of indices @@ -408,6 +408,14 @@ impl FilterPredicate { pub fn count(&self) -> usize { self.count } + + pub(crate) fn filter_array(&self) -> &BooleanArray { + &self.filter + } + + pub(crate) fn strategy(&self) -> &IterationStrategy { + &self.strategy + } } fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result { @@ -558,7 +566,7 @@ where /// `Some((null_count, null_buffer))` where `null_count` is the number of nulls /// in the filtered output, and `null_buffer` is the filtered null buffer /// -fn filter_null_mask( +pub(crate) fn filter_null_mask( nulls: Option<&NullBuffer>, predicate: &FilterPredicate, ) -> Option<(usize, Buffer)> { @@ -639,7 +647,10 @@ fn filter_boolean(array: &BooleanArray, predicate: &FilterPredicate) -> BooleanA } #[inline(never)] -fn filter_native(values: &[T], predicate: &FilterPredicate) -> Buffer { +pub(crate) fn filter_native( + values: &[T], + predicate: &FilterPredicate, +) -> Buffer { assert!(values.len() >= predicate.filter.len()); match &predicate.strategy { diff --git a/arrow/benches/coalesce_kernels.rs b/arrow/benches/coalesce_kernels.rs index b85c5cc532db..577460723f62 100644 --- a/arrow/benches/coalesce_kernels.rs +++ b/arrow/benches/coalesce_kernels.rs @@ -50,6 +50,13 @@ fn add_all_filter_benchmarks(c: &mut Criterion) { true, )])); + // Single BinaryViewArray + let single_binaryview_schema = SchemaRef::new(Schema::new(vec![Field::new( + "value", + DataType::BinaryView, + true, + )])); + // Mixed primitive, StringViewArray let mixed_utf8view_schema = SchemaRef::new(Schema::new(vec![ Field::new("int32_val", DataType::Int32, true), @@ -57,6 +64,13 @@ fn add_all_filter_benchmarks(c: &mut Criterion) { Field::new("utf8view_val", DataType::Utf8View, true), ])); + // Mixed primitive, BinaryViewArray + let mixed_binaryview_schema = SchemaRef::new(Schema::new(vec![ + Field::new("int32_val", DataType::Int32, true), + Field::new("float_val", DataType::Float64, true), + Field::new("binaryview_val", DataType::BinaryView, true), + ])); + // Mixed primitive, StringArray let mixed_utf8_schema = SchemaRef::new(Schema::new(vec![ Field::new("int32_val", DataType::Int32, true), @@ -105,6 +119,30 @@ fn add_all_filter_benchmarks(c: &mut Criterion) { } .build(); + FilterBenchmarkBuilder { + c, + name: "single_binaryview", + batch_size, + num_output_batches: 50, + null_density, + selectivity, + max_string_len: 30, + schema: &single_binaryview_schema, + } + .build(); + + FilterBenchmarkBuilder { + c, + name: "single_binaryview (max_string_len=8)", + batch_size, + num_output_batches: 50, + null_density, + selectivity, + max_string_len: 8, + schema: &single_binaryview_schema, + } + .build(); + // Model mostly short strings, but some longer ones FilterBenchmarkBuilder { c, @@ -131,6 +169,42 @@ fn add_all_filter_benchmarks(c: &mut Criterion) { } .build(); + FilterBenchmarkBuilder { + c, + name: "mixed_binaryview (max_string_len=20)", + batch_size, + num_output_batches: 20, + null_density, + selectivity, + max_string_len: 20, + schema: &mixed_binaryview_schema, + } + .build(); + + FilterBenchmarkBuilder { + c, + name: "mixed_binaryview (max_string_len=8)", + batch_size, + num_output_batches: 20, + null_density, + selectivity, + max_string_len: 8, + schema: &mixed_binaryview_schema, + } + .build(); + + FilterBenchmarkBuilder { + c, + name: "mixed_binaryview (max_string_len=128)", + batch_size, + num_output_batches: 20, + null_density, + selectivity, + max_string_len: 128, + schema: &mixed_binaryview_schema, + } + .build(); + FilterBenchmarkBuilder { c, name: "mixed_utf8", @@ -455,6 +529,17 @@ impl DataStreamBuilder { self.max_string_len, )) // TODO seed } + DataType::BinaryView => Arc::new(BinaryViewArray::from_iter( + create_binary_array_with_len_range_and_prefix_and_seed::( + self.batch_size, + self.null_density, + 0, + self.max_string_len, + b"", + seed, + ) + .iter(), + )), DataType::Dictionary(key_type, value_type) if key_type.as_ref() == &DataType::Int32 && value_type.as_ref() == &DataType::Utf8 => From 79078260cabe9d086292b6c2f5e6ea1d0012b901 Mon Sep 17 00:00:00 2001 From: cl Date: Sat, 18 Apr 2026 13:16:46 +0800 Subject: [PATCH 2/4] arrow-select: clarify fused BinaryView filter coalescing dispatch Split push_batch_with_filter into an explicit fused path classifier and dedicated inline BinaryView dispatch helpers, and use iterator-based extension in the primitive fast path. This keeps unsupported BinaryView layouts on the existing filter path while making the optimised routing for issue #9143 clearer to review and maintain. Signed-off-by: cl --- arrow-select/src/coalesce.rs | 249 +++++++++++++++---------- arrow-select/src/coalesce/primitive.rs | 17 +- 2 files changed, 157 insertions(+), 109 deletions(-) diff --git a/arrow-select/src/coalesce.rs b/arrow-select/src/coalesce.rs index 01cb2fa19579..c414ff29a96c 100644 --- a/arrow-select/src/coalesce.rs +++ b/arrow-select/src/coalesce.rs @@ -151,6 +151,13 @@ pub struct BatchCoalescer { biggest_coalesce_batch_size: Option, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum FusedFilterPath { + InlineBinaryViewOnly, + MixedPrimitiveInlineBinaryView, + Unsupported, +} + impl BatchCoalescer { /// Create a new `BatchCoalescer` /// @@ -242,81 +249,16 @@ impl BatchCoalescer { batch: RecordBatch, filter: &BooleanArray, ) -> Result<(), ArrowError> { - if supports_fused_inline_binary_view_filter(&batch) { - if filter.len() > batch.num_rows() { - return Err(ArrowError::InvalidArgumentError(format!( - "Filter predicate of length {} is larger than target array of length {}", - filter.len(), - batch.num_rows() - ))); - } - - let mut filter_builder = FilterBuilder::new(filter); - if batch.num_columns() > 1 { - filter_builder = filter_builder.optimize(); - } - let predicate = filter_builder.build(); - let selected_count = predicate.count(); - - if selected_count == 0 { - return Ok(()); - } - - if selected_count == batch.num_rows() && filter.len() == batch.num_rows() { - return self.push_batch(batch); - } - - if let Some(limit) = self.biggest_coalesce_batch_size { - if selected_count > limit { - let filtered_batch = predicate.filter_record_batch(&batch)?; - return self.push_batch(filtered_batch); - } - } - - // For dense inline filters, the existing filter kernel remains faster. - if selected_count.saturating_mul(4) > filter.len() { - let filtered_batch = predicate.filter_record_batch(&batch)?; - return self.push_batch(filtered_batch); + match classify_fused_filter_path(&batch) { + FusedFilterPath::InlineBinaryViewOnly => { + return self.push_batch_with_filter_fused_inline_binary_view_only(batch, filter); } - - let space_in_batch = self.target_batch_size - self.buffered_rows; - if selected_count <= space_in_batch { - let (_schema, arrays, _num_rows) = batch.into_parts(); - - if arrays.len() != self.in_progress_arrays.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Batch has {} columns but BatchCoalescer expects {}", - arrays.len(), - self.in_progress_arrays.len() - ))); - } - - self.in_progress_arrays - .iter_mut() - .zip(arrays) - .for_each(|(in_progress, array)| { - in_progress.set_source(Some(array)); - }); - - let result = (|| { - for in_progress in self.in_progress_arrays.iter_mut() { - in_progress.copy_rows_by_filter(&predicate)?; - } - - self.buffered_rows += selected_count; - if self.buffered_rows >= self.target_batch_size { - self.finish_buffered_batch()?; - } - - Ok(()) - })(); - - for in_progress in self.in_progress_arrays.iter_mut() { - in_progress.set_source(None); - } - - return result; + FusedFilterPath::MixedPrimitiveInlineBinaryView => { + return self.push_batch_with_filter_fused_mixed_primitive_inline_binary_view( + batch, filter, + ); } + FusedFilterPath::Unsupported => {} } // TODO: optimize this to avoid materializing (copying the results @@ -647,6 +589,142 @@ impl BatchCoalescer { } } +#[inline] +fn classify_fused_filter_path(batch: &RecordBatch) -> FusedFilterPath { + let mut has_inline_binary_view = false; + let mut has_primitive = false; + + // The fused path only supports fully inline BinaryView columns, optionally + // mixed with primitive columns. Any non-inline BinaryView still uses the + // existing filter_record_batch path. + for (field, array) in batch.schema().fields().iter().zip(batch.columns()) { + if field.data_type().is_primitive() { + has_primitive = true; + continue; + } + + let Some(binary_view) = array.as_binary_view_opt() else { + return FusedFilterPath::Unsupported; + }; + + if !binary_view.data_buffers().is_empty() { + return FusedFilterPath::Unsupported; + } + + has_inline_binary_view = true; + } + + match (has_inline_binary_view, has_primitive) { + (true, true) => FusedFilterPath::MixedPrimitiveInlineBinaryView, + (true, false) => FusedFilterPath::InlineBinaryViewOnly, + (false, _) => FusedFilterPath::Unsupported, + } +} + +impl BatchCoalescer { + #[inline] + fn push_batch_with_filter_fused_inline_binary_view_only( + &mut self, + batch: RecordBatch, + filter: &BooleanArray, + ) -> Result<(), ArrowError> { + self.push_batch_with_filter_fused_inline_binary_view(batch, filter) + } + + #[inline] + fn push_batch_with_filter_fused_mixed_primitive_inline_binary_view( + &mut self, + batch: RecordBatch, + filter: &BooleanArray, + ) -> Result<(), ArrowError> { + self.push_batch_with_filter_fused_inline_binary_view(batch, filter) + } + + fn push_batch_with_filter_fused_inline_binary_view( + &mut self, + batch: RecordBatch, + filter: &BooleanArray, + ) -> Result<(), ArrowError> { + if filter.len() > batch.num_rows() { + return Err(ArrowError::InvalidArgumentError(format!( + "Filter predicate of length {} is larger than target array of length {}", + filter.len(), + batch.num_rows() + ))); + } + + let mut filter_builder = FilterBuilder::new(filter); + if batch.num_columns() > 1 { + filter_builder = filter_builder.optimize(); + } + let predicate = filter_builder.build(); + let selected_count = predicate.count(); + + if selected_count == 0 { + return Ok(()); + } + + if selected_count == batch.num_rows() && filter.len() == batch.num_rows() { + return self.push_batch(batch); + } + + if let Some(limit) = self.biggest_coalesce_batch_size { + if selected_count > limit { + let filtered_batch = predicate.filter_record_batch(&batch)?; + return self.push_batch(filtered_batch); + } + } + + // For dense inline filters, the existing filter kernel remains faster. + if selected_count.saturating_mul(4) > filter.len() { + let filtered_batch = predicate.filter_record_batch(&batch)?; + return self.push_batch(filtered_batch); + } + + let space_in_batch = self.target_batch_size - self.buffered_rows; + if selected_count > space_in_batch { + let filtered_batch = predicate.filter_record_batch(&batch)?; + return self.push_batch(filtered_batch); + } + + let (_schema, arrays, _num_rows) = batch.into_parts(); + + if arrays.len() != self.in_progress_arrays.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Batch has {} columns but BatchCoalescer expects {}", + arrays.len(), + self.in_progress_arrays.len() + ))); + } + + self.in_progress_arrays + .iter_mut() + .zip(arrays) + .for_each(|(in_progress, array)| { + in_progress.set_source(Some(array)); + }); + + let result = (|| { + for in_progress in self.in_progress_arrays.iter_mut() { + in_progress.copy_rows_by_filter(&predicate)?; + } + + self.buffered_rows += selected_count; + if self.buffered_rows >= self.target_batch_size { + self.finish_buffered_batch()?; + } + + Ok(()) + })(); + + for in_progress in self.in_progress_arrays.iter_mut() { + in_progress.set_source(None); + } + + result + } +} + /// Return a new `InProgressArray` for the given data type fn create_in_progress_array(data_type: &DataType, batch_size: usize) -> Box { macro_rules! instantiate_primitive { @@ -669,31 +747,6 @@ fn create_in_progress_array(data_type: &DataType, batch_size: usize) -> Box bool { - let mut has_inline_binary_view = false; - - let supported = batch - .schema() - .fields() - .iter() - .zip(batch.columns()) - .all(|(field, array)| { - if field.data_type().is_primitive() { - return true; - } - - let Some(binary_view) = array.as_binary_view_opt() else { - return false; - }; - - let inline = binary_view.data_buffers().is_empty(); - has_inline_binary_view |= inline; - inline - }); - - supported && has_inline_binary_view -} - /// Incrementally builds up arrays /// /// [`GenericInProgressArray`] is the default implementation that buffers diff --git a/arrow-select/src/coalesce/primitive.rs b/arrow-select/src/coalesce/primitive.rs index 6bb93a21e2b3..bfdb7295fe4f 100644 --- a/arrow-select/src/coalesce/primitive.rs +++ b/arrow-select/src/coalesce/primitive.rs @@ -138,18 +138,16 @@ impl InProgressArray for InProgressPrimitiveArray } let values = s.values(); - self.current.reserve(filter.count()); match filter.strategy() { IterationStrategy::Indices(indices) => { - for &idx in indices { - self.current.push(values[idx]); - } + self.current.extend(indices.iter().map(|&idx| values[idx])); } IterationStrategy::IndexIterator => { - for idx in IndexIterator::new(filter.filter_array(), filter.count()) { - self.current.push(values[idx]); - } + self.current.extend( + IndexIterator::new(filter.filter_array(), filter.count()) + .map(|idx| values[idx]), + ); } IterationStrategy::None | IterationStrategy::All @@ -182,10 +180,7 @@ impl InProgressArray for InProgressPrimitiveArray } let values = s.values(); - self.current.reserve(indices.len()); - for &idx in indices { - self.current.push(values[idx]); - } + self.current.extend(indices.iter().map(|&idx| values[idx])); Ok(()) } From c9e37ec76208a1eef9ff0db3a0f3d6565f23833c Mon Sep 17 00:00:00 2001 From: cl Date: Mon, 20 Apr 2026 16:19:00 +0800 Subject: [PATCH 3/4] patch: cache optimisation path Signed-off-by: cl --- arrow-select/src/coalesce.rs | 70 ++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/arrow-select/src/coalesce.rs b/arrow-select/src/coalesce.rs index c414ff29a96c..e075c9cc0282 100644 --- a/arrow-select/src/coalesce.rs +++ b/arrow-select/src/coalesce.rs @@ -149,6 +149,8 @@ pub struct BatchCoalescer { completed: VecDeque, /// Biggest coalesce batch size. See [`Self::with_biggest_coalesce_batch_size`] biggest_coalesce_batch_size: Option, + /// Cached schema-level support for fused filter coalescing. + fused_filter_support: FusedFilterSupport, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -158,6 +160,13 @@ enum FusedFilterPath { Unsupported, } +#[derive(Debug, Clone, PartialEq, Eq)] +enum FusedFilterSupport { + InlineBinaryViewOnly(Vec), + MixedPrimitiveInlineBinaryView(Vec), + Unsupported, +} + impl BatchCoalescer { /// Create a new `BatchCoalescer` /// @@ -167,6 +176,7 @@ impl BatchCoalescer { /// Typical values are `4096` or `8192` rows. /// pub fn new(schema: SchemaRef, target_batch_size: usize) -> Self { + let fused_filter_support = classify_fused_filter_support(&schema); let in_progress_arrays = schema .fields() .iter() @@ -181,6 +191,7 @@ impl BatchCoalescer { completed: VecDeque::with_capacity(1), buffered_rows: 0, biggest_coalesce_batch_size: None, + fused_filter_support, } } @@ -249,7 +260,7 @@ impl BatchCoalescer { batch: RecordBatch, filter: &BooleanArray, ) -> Result<(), ArrowError> { - match classify_fused_filter_path(&batch) { + match classify_fused_filter_path(&self.fused_filter_support, &batch) { FusedFilterPath::InlineBinaryViewOnly => { return self.push_batch_with_filter_fused_inline_binary_view_only(batch, filter); } @@ -590,35 +601,64 @@ impl BatchCoalescer { } #[inline] -fn classify_fused_filter_path(batch: &RecordBatch) -> FusedFilterPath { - let mut has_inline_binary_view = false; +fn classify_fused_filter_support(schema: &SchemaRef) -> FusedFilterSupport { + let mut binary_view_indices = Vec::new(); let mut has_primitive = false; - // The fused path only supports fully inline BinaryView columns, optionally - // mixed with primitive columns. Any non-inline BinaryView still uses the - // existing filter_record_batch path. - for (field, array) in batch.schema().fields().iter().zip(batch.columns()) { + for (index, field) in schema.fields().iter().enumerate() { if field.data_type().is_primitive() { has_primitive = true; continue; } - let Some(binary_view) = array.as_binary_view_opt() else { + if field.data_type() == &DataType::BinaryView { + binary_view_indices.push(index); + continue; + } + + return FusedFilterSupport::Unsupported; + } + + match (!binary_view_indices.is_empty(), has_primitive) { + (true, true) => FusedFilterSupport::MixedPrimitiveInlineBinaryView(binary_view_indices), + (true, false) => FusedFilterSupport::InlineBinaryViewOnly(binary_view_indices), + (false, _) => FusedFilterSupport::Unsupported, + } +} + +#[inline] +fn classify_fused_filter_path( + fused_filter_support: &FusedFilterSupport, + batch: &RecordBatch, +) -> FusedFilterPath { + let (path, binary_view_indices) = match fused_filter_support { + FusedFilterSupport::InlineBinaryViewOnly(indices) => { + (FusedFilterPath::InlineBinaryViewOnly, indices.as_slice()) + } + FusedFilterSupport::MixedPrimitiveInlineBinaryView(indices) => ( + FusedFilterPath::MixedPrimitiveInlineBinaryView, + indices.as_slice(), + ), + FusedFilterSupport::Unsupported => return FusedFilterPath::Unsupported, + }; + + // Only candidate schemas pay the per-batch check that all BinaryView values + // are inline and therefore eligible for the fused direct-copy path. + for &index in binary_view_indices { + let Some(binary_view) = batch + .columns() + .get(index) + .and_then(|array| array.as_binary_view_opt()) + else { return FusedFilterPath::Unsupported; }; if !binary_view.data_buffers().is_empty() { return FusedFilterPath::Unsupported; } - - has_inline_binary_view = true; } - match (has_inline_binary_view, has_primitive) { - (true, true) => FusedFilterPath::MixedPrimitiveInlineBinaryView, - (true, false) => FusedFilterPath::InlineBinaryViewOnly, - (false, _) => FusedFilterPath::Unsupported, - } + path } impl BatchCoalescer { From 6ec7f28f65494e956321571c279b584fe0e68242 Mon Sep 17 00:00:00 2001 From: cl Date: Mon, 20 Apr 2026 17:24:31 +0800 Subject: [PATCH 4/4] arrow-select: simplify fused view filter coalescing Collapse the cached Utf8View/BinaryView dispatch state into a single fused filter so unsupported schemas stay on the generic path with minimal overhead. This keeps the fused inline view path easier to follow while preserving the targeted coalescing fast path. Signed-off-by: cl --- arrow-select/src/coalesce.rs | 181 ++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 87 deletions(-) diff --git a/arrow-select/src/coalesce.rs b/arrow-select/src/coalesce.rs index e075c9cc0282..3be95642b53f 100644 --- a/arrow-select/src/coalesce.rs +++ b/arrow-select/src/coalesce.rs @@ -149,22 +149,19 @@ pub struct BatchCoalescer { completed: VecDeque, /// Biggest coalesce batch size. See [`Self::with_biggest_coalesce_batch_size`] biggest_coalesce_batch_size: Option, - /// Cached schema-level support for fused filter coalescing. - fused_filter_support: FusedFilterSupport, + /// Cached schema-level fused filter support, if supported. + fused_filter: Option, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum FusedFilterPath { - InlineBinaryViewOnly, - MixedPrimitiveInlineBinaryView, - Unsupported, +#[derive(Debug, Clone, PartialEq, Eq)] +struct FusedFilter { + view_columns: Vec, } -#[derive(Debug, Clone, PartialEq, Eq)] -enum FusedFilterSupport { - InlineBinaryViewOnly(Vec), - MixedPrimitiveInlineBinaryView(Vec), - Unsupported, +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum FusedViewColumn { + Utf8(usize), + Binary(usize), } impl BatchCoalescer { @@ -176,7 +173,7 @@ impl BatchCoalescer { /// Typical values are `4096` or `8192` rows. /// pub fn new(schema: SchemaRef, target_batch_size: usize) -> Self { - let fused_filter_support = classify_fused_filter_support(&schema); + let fused_filter = classify_fused_filter(&schema); let in_progress_arrays = schema .fields() .iter() @@ -191,7 +188,7 @@ impl BatchCoalescer { completed: VecDeque::with_capacity(1), buffered_rows: 0, biggest_coalesce_batch_size: None, - fused_filter_support, + fused_filter, } } @@ -260,16 +257,10 @@ impl BatchCoalescer { batch: RecordBatch, filter: &BooleanArray, ) -> Result<(), ArrowError> { - match classify_fused_filter_path(&self.fused_filter_support, &batch) { - FusedFilterPath::InlineBinaryViewOnly => { - return self.push_batch_with_filter_fused_inline_binary_view_only(batch, filter); + if let Some(fused_filter) = &self.fused_filter { + if fused_filter.supports_batch(&batch) { + return self.push_batch_with_filter_fused_inline_view(batch, filter); } - FusedFilterPath::MixedPrimitiveInlineBinaryView => { - return self.push_batch_with_filter_fused_mixed_primitive_inline_binary_view( - batch, filter, - ); - } - FusedFilterPath::Unsupported => {} } // TODO: optimize this to avoid materializing (copying the results @@ -601,86 +592,54 @@ impl BatchCoalescer { } #[inline] -fn classify_fused_filter_support(schema: &SchemaRef) -> FusedFilterSupport { - let mut binary_view_indices = Vec::new(); - let mut has_primitive = false; +fn classify_fused_filter(schema: &SchemaRef) -> Option { + let mut view_columns = Vec::new(); for (index, field) in schema.fields().iter().enumerate() { if field.data_type().is_primitive() { - has_primitive = true; continue; } - if field.data_type() == &DataType::BinaryView { - binary_view_indices.push(index); - continue; + match field.data_type() { + DataType::Utf8View => view_columns.push(FusedViewColumn::Utf8(index)), + DataType::BinaryView => view_columns.push(FusedViewColumn::Binary(index)), + _ => return None, } - - return FusedFilterSupport::Unsupported; } - match (!binary_view_indices.is_empty(), has_primitive) { - (true, true) => FusedFilterSupport::MixedPrimitiveInlineBinaryView(binary_view_indices), - (true, false) => FusedFilterSupport::InlineBinaryViewOnly(binary_view_indices), - (false, _) => FusedFilterSupport::Unsupported, - } + (!view_columns.is_empty()).then_some(FusedFilter { view_columns }) } -#[inline] -fn classify_fused_filter_path( - fused_filter_support: &FusedFilterSupport, - batch: &RecordBatch, -) -> FusedFilterPath { - let (path, binary_view_indices) = match fused_filter_support { - FusedFilterSupport::InlineBinaryViewOnly(indices) => { - (FusedFilterPath::InlineBinaryViewOnly, indices.as_slice()) - } - FusedFilterSupport::MixedPrimitiveInlineBinaryView(indices) => ( - FusedFilterPath::MixedPrimitiveInlineBinaryView, - indices.as_slice(), - ), - FusedFilterSupport::Unsupported => return FusedFilterPath::Unsupported, - }; - - // Only candidate schemas pay the per-batch check that all BinaryView values - // are inline and therefore eligible for the fused direct-copy path. - for &index in binary_view_indices { - let Some(binary_view) = batch - .columns() - .get(index) - .and_then(|array| array.as_binary_view_opt()) - else { - return FusedFilterPath::Unsupported; - }; - - if !binary_view.data_buffers().is_empty() { - return FusedFilterPath::Unsupported; +impl FusedFilter { + #[inline] + fn supports_batch(&self, batch: &RecordBatch) -> bool { + // Only candidate schemas pay the per-batch check that all supported view values + // are inline and therefore eligible for the fused direct-copy path. + for view_column in &self.view_columns { + let is_inline = match *view_column { + FusedViewColumn::Utf8(index) => batch + .columns() + .get(index) + .and_then(|array| array.as_string_view_opt()) + .is_some_and(|view| view.data_buffers().is_empty()), + FusedViewColumn::Binary(index) => batch + .columns() + .get(index) + .and_then(|array| array.as_binary_view_opt()) + .is_some_and(|view| view.data_buffers().is_empty()), + }; + + if !is_inline { + return false; + } } - } - path + true + } } impl BatchCoalescer { - #[inline] - fn push_batch_with_filter_fused_inline_binary_view_only( - &mut self, - batch: RecordBatch, - filter: &BooleanArray, - ) -> Result<(), ArrowError> { - self.push_batch_with_filter_fused_inline_binary_view(batch, filter) - } - - #[inline] - fn push_batch_with_filter_fused_mixed_primitive_inline_binary_view( - &mut self, - batch: RecordBatch, - filter: &BooleanArray, - ) -> Result<(), ArrowError> { - self.push_batch_with_filter_fused_inline_binary_view(batch, filter) - } - - fn push_batch_with_filter_fused_inline_binary_view( + fn push_batch_with_filter_fused_inline_view( &mut self, batch: RecordBatch, filter: &BooleanArray, @@ -1473,6 +1432,26 @@ mod tests { .run(); } + #[test] + fn test_string_view_filtered_inline() { + let values: Vec> = vec![Some("foo"), None, Some("barbaz")]; + + let string_view = + StringViewArray::from_iter(std::iter::repeat(values.iter()).flatten().take(1000)); + let batch = + RecordBatch::try_from_iter(vec![("c0", Arc::new(string_view) as ArrayRef)]).unwrap(); + let filter = BooleanArray::from_iter((0..1000).map(|idx| Some(idx % 3 != 1))); + + Test::new("coalesce_string_view_filtered_inline") + .with_batch(batch.clone()) + .with_filter(filter.clone()) + .with_batch(batch) + .with_filter(filter) + .with_batch_size(300) + .with_expected_output_sizes(vec![300, 300, 300, 300, 134]) + .run(); + } + #[test] fn test_mixed_inline_binary_view_filtered() { let int_values = Int32Array::from_iter((0..1000).map(Some)); @@ -1501,6 +1480,34 @@ mod tests { .run(); } + #[test] + fn test_mixed_inline_string_view_filtered() { + let int_values = Int32Array::from_iter((0..1000).map(Some)); + let float_values = arrow_array::Float64Array::from_iter((0..1000).map(|v| Some(v as f64))); + let string_values: Vec> = vec![Some("foo"), None, Some("barbaz")]; + let string_view = StringViewArray::from_iter( + std::iter::repeat(string_values.iter()).flatten().take(1000), + ); + + let batch = RecordBatch::try_from_iter(vec![ + ("i", Arc::new(int_values) as ArrayRef), + ("f", Arc::new(float_values) as ArrayRef), + ("s", Arc::new(string_view) as ArrayRef), + ]) + .unwrap(); + + let filter = BooleanArray::from_iter((0..1000).map(|idx| Some(idx % 3 != 1))); + + Test::new("coalesce_mixed_inline_string_view_filtered") + .with_batch(batch.clone()) + .with_filter(filter.clone()) + .with_batch(batch) + .with_filter(filter) + .with_batch_size(300) + .with_expected_output_sizes(vec![300, 300, 300, 300, 134]) + .run(); + } + #[derive(Debug, Clone, PartialEq)] struct ExpectedLayout { len: usize,